From e30f2056f2ad264c77d1ba49120e5522d7c6b0e0 Mon Sep 17 00:00:00 2001 From: David Potter Date: Fri, 8 Aug 2025 11:31:16 -0700 Subject: [PATCH 01/19] email date format flexibility --- example-docs/eml/test-iso-8601-date.eml | 6 ++++++ example-docs/eml/test-rfc2822-date.eml | 6 ++++++ test_unstructured/partition/test_email.py | 17 ++++++++++++++++- unstructured/partition/email.py | 12 +++++++++--- 4 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 example-docs/eml/test-iso-8601-date.eml create mode 100644 example-docs/eml/test-rfc2822-date.eml diff --git a/example-docs/eml/test-iso-8601-date.eml b/example-docs/eml/test-iso-8601-date.eml new file mode 100644 index 0000000000..82f021c1bd --- /dev/null +++ b/example-docs/eml/test-iso-8601-date.eml @@ -0,0 +1,6 @@ +Date: 2025-07-29T12:42:06.000Z +From: sender@example.com +To: recipient@example.com +Subject: Test a Z-suffix date + +This is a test-email. diff --git a/example-docs/eml/test-rfc2822-date.eml b/example-docs/eml/test-rfc2822-date.eml new file mode 100644 index 0000000000..3950640b62 --- /dev/null +++ b/example-docs/eml/test-rfc2822-date.eml @@ -0,0 +1,6 @@ +Date: Tue, 29 Jul 2025 12:42:06 +0000 +From: sender@example.com +To: recipient@example.com +Subject: Test a standard RFC-2822 date + +This is a test-email. diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 5d5937d3a4..b87e4816ef 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -521,10 +521,25 @@ def it_uses_the_metadata_last_modified_arg_value_when_one_was_provided(self): ctx = EmailPartitioningContext(metadata_last_modified=metadata_last_modified) assert ctx.metadata_last_modified == metadata_last_modified - def and_it_uses_the_msg_Date_header_date_when_metadata_last_modified_was_not_provided(self): + def and_it_uses_the_msg_Date_header_date_when_metadata_last_modified_was_not_provided( + self, + ): ctx = EmailPartitioningContext(example_doc_path("eml/simple-rfc-822.eml")) assert ctx.metadata_last_modified == "2024-10-01T17:34:56+00:00" + @pytest.mark.parametrize( + ("date_format", "expected_date"), + [ + ("test-iso-8601-date.eml", "2025-07-29T12:42:06+00:00"), + ("test-rfc2822-date.eml", "2025-07-29T12:42:06+00:00"), + ], + ) + def and_it_correctly_parses_various_date_formats_like_the_ones_that_occur_in_the_wild( + self, date_format: str, expected_date: str + ): + ctx = EmailPartitioningContext(example_doc_path(f"eml/{date_format}")) + assert ctx.metadata_last_modified == expected_date + def and_it_falls_back_to_filesystem_last_modified_when_no_Date_header_is_present( self, get_last_modified_date_: Mock ): diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index d194e22d21..c0c8e7ab40 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -14,6 +14,8 @@ from email.message import EmailMessage, MIMEPart from typing import IO, Any, Final, Iterator, cast +from dateutil import parser + from unstructured.documents.elements import Element, ElementMetadata from unstructured.file_utils.model import FileType from unstructured.partition.common import UnsupportedFileFormatError @@ -279,7 +281,11 @@ def _sent_date(self) -> str | None: date_str = self.msg.get("Date") if not date_str: return None - sent_date = email.utils.parsedate_to_datetime(date_str) + try: + sent_date = parser.parse(date_str) + except (parser.ParserError, TypeError, ValueError): + return None + return sent_date.astimezone(dt.timezone.utc).isoformat(timespec="seconds") def _validate(self) -> EmailPartitioningContext: @@ -365,13 +371,13 @@ def _iter_email_body_elements(self) -> Iterator[Element]: class _AttachmentPartitioner: """Partitions an attachment to a MSG file.""" - def __init__(self, attachment: EmailMessage, ctx: EmailPartitioningContext): + def __init__(self, attachment: MIMEPart, ctx: EmailPartitioningContext): self._attachment = attachment self._ctx = ctx @classmethod def iter_elements( - cls, attachment: EmailMessage, ctx: EmailPartitioningContext + cls, attachment: MIMEPart, ctx: EmailPartitioningContext ) -> Iterator[Element]: """Partition an attachment MIME-part from a MIME email message (.eml file).""" return cls(attachment, ctx)._iter_elements() From a30dada6cf2b86f07774e1f8bab2ae5996796048 Mon Sep 17 00:00:00 2001 From: David Potter Date: Fri, 8 Aug 2025 11:46:31 -0700 Subject: [PATCH 02/19] changelog --- CHANGELOG.md | 10 ++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fadc442990..37a1172ae9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.18.13 + +### Enhancements + +### Features + +### Fixes + +- **Parse a wider variety of date formats in email headers** The `partition_email` function is now more robust to non-standard date formats, including ISO-8601 dates with "Z" suffixes. This prevents `ValueError` exceptions when partitioning emails with these date formats. + ## 0.18.12 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7949610e2a..7774420d99 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.12" # pragma: no cover +__version__ = "0.18.13" # pragma: no cover From 706284581f8b521124b49c3e9a938d2b200449be Mon Sep 17 00:00:00 2001 From: David Potter Date: Fri, 8 Aug 2025 15:15:23 -0700 Subject: [PATCH 03/19] reset type hint that was strange --- unstructured/partition/email.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index c0c8e7ab40..cb6b74d093 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -371,13 +371,13 @@ def _iter_email_body_elements(self) -> Iterator[Element]: class _AttachmentPartitioner: """Partitions an attachment to a MSG file.""" - def __init__(self, attachment: MIMEPart, ctx: EmailPartitioningContext): + def __init__(self, attachment: EmailMessage, ctx: EmailPartitioningContext): self._attachment = attachment self._ctx = ctx @classmethod def iter_elements( - cls, attachment: MIMEPart, ctx: EmailPartitioningContext + cls, attachment: EmailMessage, ctx: EmailPartitioningContext ) -> Iterator[Element]: """Partition an attachment MIME-part from a MIME email message (.eml file).""" return cls(attachment, ctx)._iter_elements() From 4d3e840a7356d2536950ff746708f584cedf6fab Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 07:21:03 -0700 Subject: [PATCH 04/19] add invalid date format test --- example-docs/eml/test-invalid-date.eml | 6 ++++++ test_unstructured/partition/test_email.py | 4 ++++ 2 files changed, 10 insertions(+) create mode 100644 example-docs/eml/test-invalid-date.eml diff --git a/example-docs/eml/test-invalid-date.eml b/example-docs/eml/test-invalid-date.eml new file mode 100644 index 0000000000..957b0c6df6 --- /dev/null +++ b/example-docs/eml/test-invalid-date.eml @@ -0,0 +1,6 @@ +Date: INVALID-DATE-FORMAT +From: sender@example.com +To: recipient@example.com +Subject: Test invalid date format + +This is a test-email with an invalid date format. \ No newline at end of file diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index b87e4816ef..4efe5944a1 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -540,6 +540,10 @@ def and_it_correctly_parses_various_date_formats_like_the_ones_that_occur_in_the ctx = EmailPartitioningContext(example_doc_path(f"eml/{date_format}")) assert ctx.metadata_last_modified == expected_date + def and_it_returns_none_when_date_header_is_invalid(self): + ctx = EmailPartitioningContext(example_doc_path("eml/test-invalid-date.eml")) + assert ctx._sent_date is None + def and_it_falls_back_to_filesystem_last_modified_when_no_Date_header_is_present( self, get_last_modified_date_: Mock ): From 224a6b2112114f52a31731046db8948a17652a8b Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 08:39:53 -0700 Subject: [PATCH 05/19] add comment for shellcheck --- scripts/docker-smoke-test.sh | 1 + test_unstructured_ingest/src/azure.sh | 2 ++ test_unstructured_ingest/src/google-drive.sh | 2 ++ test_unstructured_ingest/src/kafka-local.sh | 2 ++ .../src/local-single-file-basic-chunking.sh | 2 ++ .../src/local-single-file-chunk-no-orig-elements.sh | 2 ++ test_unstructured_ingest/src/local-single-file-with-encoding.sh | 2 ++ .../src/local-single-file-with-pdf-infer-table-structure.sh | 2 ++ test_unstructured_ingest/src/local-single-file.sh | 2 ++ test_unstructured_ingest/src/s3-minio.sh | 2 ++ test_unstructured_ingest/src/s3.sh | 2 ++ test_unstructured_ingest/src/sharepoint.sh | 2 ++ 12 files changed, 23 insertions(+) diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh index 0e66e05ae4..f2c763f0b0 100755 --- a/scripts/docker-smoke-test.sh +++ b/scripts/docker-smoke-test.sh @@ -3,6 +3,7 @@ # Start the containerized repository and run ingest tests # shellcheck disable=SC2317 # Shellcheck complains that trap functions are unreachable... +# shellcheck disable=SC2329 # Functions are invoked indirectly set -eux -o pipefail diff --git a/test_unstructured_ingest/src/azure.sh b/test_unstructured_ingest/src/azure.sh index 9c64353e9a..0480d3cd02 100755 --- a/test_unstructured_ingest/src/azure.sh +++ b/test_unstructured_ingest/src/azure.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/google-drive.sh b/test_unstructured_ingest/src/google-drive.sh index a1bc46d3a4..35017077e0 100755 --- a/test_unstructured_ingest/src/google-drive.sh +++ b/test_unstructured_ingest/src/google-drive.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/kafka-local.sh b/test_unstructured_ingest/src/kafka-local.sh index 9e78fba544..dc86099fff 100755 --- a/test_unstructured_ingest/src/kafka-local.sh +++ b/test_unstructured_ingest/src/kafka-local.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh index 12da9e1dde..d59cbfa838 100755 --- a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh +++ b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh index fc8b0a41df..05119980e9 100755 --- a/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh +++ b/test_unstructured_ingest/src/local-single-file-chunk-no-orig-elements.sh @@ -6,6 +6,8 @@ # option which otherwise has no other coverage. # ------------------------------------------------------------------------------------------------ +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e # -- Test Parameters: These vary by test file, others are common computed values -- diff --git a/test_unstructured_ingest/src/local-single-file-with-encoding.sh b/test_unstructured_ingest/src/local-single-file-with-encoding.sh index 9034abcfbd..9b6e9991e3 100755 --- a/test_unstructured_ingest/src/local-single-file-with-encoding.sh +++ b/test_unstructured_ingest/src/local-single-file-with-encoding.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh index 1597ffe83a..5b756983cc 100755 --- a/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh +++ b/test_unstructured_ingest/src/local-single-file-with-pdf-infer-table-structure.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/local-single-file.sh b/test_unstructured_ingest/src/local-single-file.sh index d39cccc8c3..1539aeb1f2 100755 --- a/test_unstructured_ingest/src/local-single-file.sh +++ b/test_unstructured_ingest/src/local-single-file.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/s3-minio.sh b/test_unstructured_ingest/src/s3-minio.sh index 3a63def407..6eaad2f211 100755 --- a/test_unstructured_ingest/src/s3-minio.sh +++ b/test_unstructured_ingest/src/s3-minio.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/s3.sh b/test_unstructured_ingest/src/s3.sh index 228f2b9b25..df80396938 100755 --- a/test_unstructured_ingest/src/s3.sh +++ b/test_unstructured_ingest/src/s3.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") diff --git a/test_unstructured_ingest/src/sharepoint.sh b/test_unstructured_ingest/src/sharepoint.sh index 9ac1444252..0e252f9402 100755 --- a/test_unstructured_ingest/src/sharepoint.sh +++ b/test_unstructured_ingest/src/sharepoint.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# shellcheck disable=SC2329 # Functions are invoked indirectly + set -e SRC_PATH=$(dirname "$(realpath "$0")") From 0f64893e733c1ba559c64d4fefabd10e8b1558a4 Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 08:50:14 -0700 Subject: [PATCH 06/19] add ambient credentials flag --- test_unstructured_ingest/src/s3-minio.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/test_unstructured_ingest/src/s3-minio.sh b/test_unstructured_ingest/src/s3-minio.sh index 6eaad2f211..1da569b5bb 100755 --- a/test_unstructured_ingest/src/s3-minio.sh +++ b/test_unstructured_ingest/src/s3-minio.sh @@ -47,6 +47,7 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/ \ --endpoint-url http://localhost:9000 \ + --ambient-credentials \ --work-dir "$WORK_DIR" \ local \ --output-dir "$OUTPUT_DIR" From 93034ec256837e25d96b13befacc739eeebbbdbe Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 09:12:45 -0700 Subject: [PATCH 07/19] dont use ambient --- test_unstructured_ingest/src/s3-minio.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/src/s3-minio.sh b/test_unstructured_ingest/src/s3-minio.sh index 1da569b5bb..333648f80c 100755 --- a/test_unstructured_ingest/src/s3-minio.sh +++ b/test_unstructured_ingest/src/s3-minio.sh @@ -35,8 +35,7 @@ scripts/minio-test-helpers/create-and-check-minio.sh wait RUN_SCRIPT=${RUN_SCRIPT:-unstructured-ingest} -AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ - PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ +PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ s3 \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ @@ -47,7 +46,8 @@ AWS_SECRET_ACCESS_KEY=$secret_key AWS_ACCESS_KEY_ID=$access_key \ --verbose \ --remote-url s3://utic-dev-tech-fixtures/ \ --endpoint-url http://localhost:9000 \ - --ambient-credentials \ + --key "$access_key" \ + --secret "$secret_key" \ --work-dir "$WORK_DIR" \ local \ --output-dir "$OUTPUT_DIR" From 503a3a6a1de69eb649ada180c86fe048e8e3b462 Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 10:44:08 -0700 Subject: [PATCH 08/19] update test expectations --- ...iomedical-Data-Scientists-2-pages.pdf.json | 552 ++++++++++++++++++ .../pdf-fast-reprocess/IRS-form-1987.pdf.json | 1 + 2 files changed, 553 insertions(+) create mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json create mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json new file mode 100644 index 0000000000..89dfe2a1ab --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -0,0 +1,552 @@ +[ + { + "type": "Title", + "element_id": "1e41f20785644cdea2f017cfb67bb359", + "text": "Core Skills for Biomedical Data Scientists", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Title", + "element_id": "c915a2a57c901810a698491ca2393669", + "text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Title", + "element_id": "b24c3f8d268b2f834a00966d8faef975", + "text": "Lisa Federer, MLIS, Data Science Training Coordinator", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "UncategorizedText", + "element_id": "fcff333f886b39cee0a7084a9ff9204d", + "text": "Michael F. Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Title", + "element_id": "1b86fad341db35208d75a543bcf819ae", + "text": "Executive Summary", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "fee71d4f7ef7a5f253a44f6df648d12a", + "text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "caa3c2eba90fedb7c8923ae8cd8de961", + "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "f2ecce91323f01402aa06611385262ef", + "text": "2. Programming language expertise: biomedical data scientists should be fluent in at", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Title", + "element_id": "6b6645c408540ac22f4fd4be06820271", + "text": "least one programming language (typically R and/or Python);", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "eb7d0e257b4f0178cdce46cd57b33dc2", + "text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "1a174e104169cb41cf69393a9cdc0872", + "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "ListItem", + "element_id": "57314792fe7a371933b2116bc8242622", + "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy.", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "UncategorizedText", + "element_id": "26c704088ae82677871f8f8abd78459c", + "text": "The report further details specific skills and expertise relevant to biomedical data scientists.", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Title", + "element_id": "ce78773a1364f6be706f3a5b11d50179", + "text": "Motivation", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "690b79e1d449426afb07ed40866a6bb6", + "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", + "metadata": { + "page_number": 1, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Header", + "element_id": "b810a8721369c3551c942aab9011b7d1", + "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "c8fdefac1ae82fa42caeceff04853415", + "text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Title", + "element_id": "b5b7392d0a946f5016bfa8ad0c248a9b", + "text": "Methodology", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "d9d8e38d221ae621c0ddbcabaa4a28b4", + "text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "ba70aa3bc3ad0dec6a62939c94c5a20c", + "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "24724b1f0d20a6575f2782fd525c562f", + "text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "5e6c73154a1e5f74780c69afbc9bc084", + "text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "249f6c76b2c99dadbefb8b8811b0d4cd", + "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "NarrativeText", + "element_id": "6543ce4e447de8fb3db98ceb06a50c28", + "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + }, + { + "type": "Footer", + "element_id": "1a6ff96d028f18331a9d9c9748b49321", + "text": "2", + "metadata": { + "page_number": 2, + "languages": [ + "eng" + ], + "filetype": "application/pdf", + "data_source": { + "record_locator": { + "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + }, + "permissions_data": [ + { + "mode": 33188 + } + ] + } + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json new file mode 100644 index 0000000000..0637a088a0 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json @@ -0,0 +1 @@ +[] \ No newline at end of file From 4e13bba60395636941da7714f0bdbdfff10142db Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 11:05:49 -0700 Subject: [PATCH 09/19] remove files that weren't supposed to be there --- ...iomedical-Data-Scientists-2-pages.pdf.json | 552 ------------------ .../pdf-fast-reprocess/IRS-form-1987.pdf.json | 1 - 2 files changed, 553 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json delete mode 100644 test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json deleted file mode 100644 index 89dfe2a1ab..0000000000 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ /dev/null @@ -1,552 +0,0 @@ -[ - { - "type": "Title", - "element_id": "1e41f20785644cdea2f017cfb67bb359", - "text": "Core Skills for Biomedical Data Scientists", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "c915a2a57c901810a698491ca2393669", - "text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b24c3f8d268b2f834a00966d8faef975", - "text": "Lisa Federer, MLIS, Data Science Training Coordinator", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "fcff333f886b39cee0a7084a9ff9204d", - "text": "Michael F. Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "1b86fad341db35208d75a543bcf819ae", - "text": "Executive Summary", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "fee71d4f7ef7a5f253a44f6df648d12a", - "text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "caa3c2eba90fedb7c8923ae8cd8de961", - "text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "f2ecce91323f01402aa06611385262ef", - "text": "2. Programming language expertise: biomedical data scientists should be fluent in at", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "6b6645c408540ac22f4fd4be06820271", - "text": "least one programming language (typically R and/or Python);", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "eb7d0e257b4f0178cdce46cd57b33dc2", - "text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "1a174e104169cb41cf69393a9cdc0872", - "text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "ListItem", - "element_id": "57314792fe7a371933b2116bc8242622", - "text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy.", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "UncategorizedText", - "element_id": "26c704088ae82677871f8f8abd78459c", - "text": "The report further details specific skills and expertise relevant to biomedical data scientists.", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "ce78773a1364f6be706f3a5b11d50179", - "text": "Motivation", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "690b79e1d449426afb07ed40866a6bb6", - "text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with", - "metadata": { - "page_number": 1, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Header", - "element_id": "b810a8721369c3551c942aab9011b7d1", - "text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "c8fdefac1ae82fa42caeceff04853415", - "text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Title", - "element_id": "b5b7392d0a946f5016bfa8ad0c248a9b", - "text": "Methodology", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "d9d8e38d221ae621c0ddbcabaa4a28b4", - "text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "ba70aa3bc3ad0dec6a62939c94c5a20c", - "text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "24724b1f0d20a6575f2782fd525c562f", - "text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "5e6c73154a1e5f74780c69afbc9bc084", - "text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "249f6c76b2c99dadbefb8b8811b0d4cd", - "text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "NarrativeText", - "element_id": "6543ce4e447de8fb3db98ceb06a50c28", - "text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - }, - { - "type": "Footer", - "element_id": "1a6ff96d028f18331a9d9c9748b49321", - "text": "2", - "metadata": { - "page_number": 2, - "languages": [ - "eng" - ], - "filetype": "application/pdf", - "data_source": { - "record_locator": { - "path": "/Users/potter/Documents/unstructured/test_unstructured_ingest/download/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" - }, - "permissions_data": [ - { - "mode": 33188 - } - ] - } - } - } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json deleted file mode 100644 index 0637a088a0..0000000000 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/IRS-form-1987.pdf.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file From 8a0113ec353934a32102fcc878c47a8854647d5b Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 11:42:52 -0700 Subject: [PATCH 10/19] disable test to see how it affects others --- test_unstructured_ingest/src/pdf-fast-reprocess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index 1f22cab06c..ec816f61cc 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -43,4 +43,4 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --output-dir "$OUTPUT_DIR" -"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +# "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME From 15d44db8e9d128de6cc8a221417e14839306d85b Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 13:31:51 -0700 Subject: [PATCH 11/19] reactivate test --- test_unstructured_ingest/src/pdf-fast-reprocess.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index ec816f61cc..1f22cab06c 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -43,4 +43,4 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --output-dir "$OUTPUT_DIR" -# "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME From 260da403140600d1daabeab77dd29ae076724528 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Mon, 11 Aug 2025 15:14:32 -0700 Subject: [PATCH 12/19] email date format flexibility <- Ingest test fixtures update (#4074) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. Co-authored-by: potter-potter --- ...iomedical-Data-Scientists-2-pages.pdf.html | 0 .../IRS-form-1987.pdf.html | 0 ...iomedical-Data-Scientists-2-pages.pdf.json | 50 +++++++++---------- .../IRS-form-1987.pdf.json | 0 4 files changed, 25 insertions(+), 25 deletions(-) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{ => unstructured_a5b__x87}/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{ => unstructured_hlebpmyz}/IRS-form-1987.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{ => unstructured_a5b__x87}/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json (85%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{ => unstructured_hlebpmyz}/IRS-form-1987.pdf.json (100%) diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json similarity index 85% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index d739778f43..cb80160045 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -11,7 +11,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -33,7 +33,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -55,7 +55,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -77,7 +77,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -99,7 +99,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -121,7 +121,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -143,7 +143,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -165,7 +165,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -187,7 +187,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -209,7 +209,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -231,7 +231,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -253,7 +253,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -275,7 +275,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -297,7 +297,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -319,7 +319,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -341,7 +341,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -363,7 +363,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -385,7 +385,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -407,7 +407,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -429,7 +429,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -451,7 +451,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -473,7 +473,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -495,7 +495,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -517,7 +517,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -539,7 +539,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.json From c5fb6eea8a5a4b236b6a4585e4571d73b12c94e9 Mon Sep 17 00:00:00 2001 From: David Potter Date: Mon, 11 Aug 2025 16:42:24 -0700 Subject: [PATCH 13/19] reset test changes --- ...iomedical-Data-Scientists-2-pages.pdf.html | 0 .../IRS-form-1987.pdf.html | 0 ...iomedical-Data-Scientists-2-pages.pdf.json | 50 +++++++++---------- .../IRS-form-1987.pdf.json | 0 4 files changed, 25 insertions(+), 25 deletions(-) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{unstructured_a5b__x87 => }/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{unstructured_hlebpmyz => }/IRS-form-1987.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{unstructured_a5b__x87 => }/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json (85%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{unstructured_hlebpmyz => }/IRS-form-1987.pdf.json (100%) diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json similarity index 85% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index cb80160045..d739778f43 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -11,7 +11,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -33,7 +33,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -55,7 +55,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -77,7 +77,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -99,7 +99,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -121,7 +121,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -143,7 +143,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -165,7 +165,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -187,7 +187,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -209,7 +209,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -231,7 +231,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -253,7 +253,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -275,7 +275,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -297,7 +297,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -319,7 +319,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -341,7 +341,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -363,7 +363,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -385,7 +385,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -407,7 +407,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -429,7 +429,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -451,7 +451,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -473,7 +473,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -495,7 +495,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -517,7 +517,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -539,7 +539,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_a5b__x87/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_hlebpmyz/IRS-form-1987.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json From 5582c640f22f8cffb9c82375873d09cdfb523ba4 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Mon, 11 Aug 2025 17:52:08 -0700 Subject: [PATCH 14/19] email date format flexibility <- Ingest test fixtures update (#4075) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. Co-authored-by: potter-potter --- .../IRS-form-1987.pdf.html | 0 ...iomedical-Data-Scientists-2-pages.pdf.html | 0 .../IRS-form-1987.pdf.json | 0 ...iomedical-Data-Scientists-2-pages.pdf.json | 50 +++++++++---------- 4 files changed, 25 insertions(+), 25 deletions(-) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{ => unstructured_a4m94vgo}/IRS-form-1987.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{ => unstructured_oogefa5n}/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{ => unstructured_a4m94vgo}/IRS-form-1987.pdf.json (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{ => unstructured_oogefa5n}/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json (85%) diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.json diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json similarity index 85% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index d739778f43..c79b5cc93c 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -11,7 +11,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -33,7 +33,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -55,7 +55,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -77,7 +77,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -99,7 +99,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -121,7 +121,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -143,7 +143,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -165,7 +165,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -187,7 +187,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -209,7 +209,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -231,7 +231,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -253,7 +253,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -275,7 +275,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -297,7 +297,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -319,7 +319,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -341,7 +341,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -363,7 +363,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -385,7 +385,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -407,7 +407,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -429,7 +429,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -451,7 +451,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -473,7 +473,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -495,7 +495,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -517,7 +517,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -539,7 +539,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { From 2b565a9f1f5143a65825be9fbe21d28806f96059 Mon Sep 17 00:00:00 2001 From: David Potter Date: Wed, 13 Aug 2025 09:28:48 -0700 Subject: [PATCH 15/19] restore since test expectation update did not work --- ...iomedical-Data-Scientists-2-pages.pdf.html | 0 .../IRS-form-1987.pdf.html | 0 ...iomedical-Data-Scientists-2-pages.pdf.json | 50 +++++++++---------- .../IRS-form-1987.pdf.json | 0 4 files changed, 25 insertions(+), 25 deletions(-) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{unstructured_oogefa5n => }/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/{unstructured_a4m94vgo => }/IRS-form-1987.pdf.html (100%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{unstructured_oogefa5n => }/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json (85%) rename test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/{unstructured_a4m94vgo => }/IRS-form-1987.pdf.json (100%) diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.html b/test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html similarity index 100% rename from test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.html rename to test_unstructured_ingest/expected-structured-output-html/pdf-fast-reprocess/azure/IRS-form-1987.pdf.html diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json similarity index 85% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json index c79b5cc93c..d739778f43 100644 --- a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf.json @@ -11,7 +11,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -33,7 +33,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -55,7 +55,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -77,7 +77,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -99,7 +99,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -121,7 +121,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -143,7 +143,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -165,7 +165,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -187,7 +187,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -209,7 +209,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -231,7 +231,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -253,7 +253,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -275,7 +275,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -297,7 +297,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -319,7 +319,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -341,7 +341,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -363,7 +363,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -385,7 +385,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -407,7 +407,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -429,7 +429,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -451,7 +451,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -473,7 +473,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -495,7 +495,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -517,7 +517,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { @@ -539,7 +539,7 @@ "filetype": "application/pdf", "data_source": { "record_locator": { - "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/unstructured_oogefa5n/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" + "path": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/download/azure/Core-Skills-for-Biomedical-Data-Scientists-2-pages.pdf" }, "permissions_data": [ { diff --git a/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/unstructured_a4m94vgo/IRS-form-1987.pdf.json rename to test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/azure/IRS-form-1987.pdf.json From 7d7bc4249d5ef852ef438717cd7230d1797cfada Mon Sep 17 00:00:00 2001 From: David Potter Date: Wed, 13 Aug 2025 10:21:48 -0700 Subject: [PATCH 16/19] prevent new download behavior from ruining tests --- test_unstructured_ingest/src/pdf-fast-reprocess.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index 1f22cab06c..acea404945 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -43,4 +43,11 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ local \ --output-dir "$OUTPUT_DIR" +# Flatten outputs so paths match fixtures. New behavior for downloads in unstructured-ingest is to create a nested directory structure. +mkdir -p "$OUTPUT_DIR/azure" +find "$OUTPUT_DIR/azure" -type f -name '*.json' -path '*/unstructured_*/*' -print0 | while IFS= read -r -d '' f; do + mv "$f" "$OUTPUT_DIR/azure/$(basename "$f")" +done +find "$OUTPUT_DIR/azure" -type d -name 'unstructured_*' -exec rm -rf {} + + "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME From 6bd8b7b2ec449c5e6dbb16ac7c549c80c0d4d029 Mon Sep 17 00:00:00 2001 From: David Potter Date: Wed, 13 Aug 2025 10:41:32 -0700 Subject: [PATCH 17/19] also need to update the file paths itself --- test_unstructured_ingest/src/pdf-fast-reprocess.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index acea404945..59b5b0e3ec 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -50,4 +50,16 @@ find "$OUTPUT_DIR/azure" -type f -name '*.json' -path '*/unstructured_*/*' -prin done find "$OUTPUT_DIR/azure" -type d -name 'unstructured_*' -exec rm -rf {} + +# Normalize record_locator.path to drop unstructured_* in the download path +python3 - <<'PY' +import re, sys, pathlib +root = pathlib.Path(sys.argv[1]) +for p in root.rglob('*.json'): + s = p.read_text() + s2 = re.sub(r'(/download/azure)/unstructured_[^/]+/', r'\1/', s) + if s2 != s: + p.write_text(s2) +PY "$OUTPUT_DIR/azure" + + "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME From 3dfa089498206f64b44c9227c3e7f436425d2a8b Mon Sep 17 00:00:00 2001 From: David Potter Date: Wed, 13 Aug 2025 11:02:46 -0700 Subject: [PATCH 18/19] fix syntax --- test_unstructured_ingest/src/pdf-fast-reprocess.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index 59b5b0e3ec..d41b61342e 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -51,7 +51,7 @@ done find "$OUTPUT_DIR/azure" -type d -name 'unstructured_*' -exec rm -rf {} + # Normalize record_locator.path to drop unstructured_* in the download path -python3 - <<'PY' +python3 - "$OUTPUT_DIR/azure" <<'PY' import re, sys, pathlib root = pathlib.Path(sys.argv[1]) for p in root.rglob('*.json'): @@ -59,7 +59,7 @@ for p in root.rglob('*.json'): s2 = re.sub(r'(/download/azure)/unstructured_[^/]+/', r'\1/', s) if s2 != s: p.write_text(s2) -PY "$OUTPUT_DIR/azure" +PY "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME From 359e8e78a4a852ebd56c98053727c85ab2118810 Mon Sep 17 00:00:00 2001 From: David Potter Date: Wed, 13 Aug 2025 11:16:29 -0700 Subject: [PATCH 19/19] lint --- test_unstructured_ingest/src/pdf-fast-reprocess.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/test_unstructured_ingest/src/pdf-fast-reprocess.sh b/test_unstructured_ingest/src/pdf-fast-reprocess.sh index d41b61342e..f78cd94253 100755 --- a/test_unstructured_ingest/src/pdf-fast-reprocess.sh +++ b/test_unstructured_ingest/src/pdf-fast-reprocess.sh @@ -60,6 +60,4 @@ for p in root.rglob('*.json'): if s2 != s: p.write_text(s2) PY - - "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME