MobilityData · qcdyx · Sep 9, 2025 · Sep 10, 2025 · Sep 10, 2025 · Sep 12, 2025
diff --git a/functions-python/batch_process_dataset/README.md b/functions-python/batch_process_dataset/README.md
@@ -49,6 +49,23 @@ The function expects a Pub/Sub message with the following format:
     }
   }
 }
+
+{
+  "message": {
+    "data": {
+      "execution_id": "batch-trace-e5eaa516bd884c0a39861d08de301d97/2153210919778512803;o=1",
+      "producer_url": "https://www.stm.info/sites/default/files/gtfs/gtfs_stm.zip",
+      "feed_stable_id": "mdb-2126",
+      "feed_id": "9f1748c5-b482-4577-819e-ce78c75980b3",
+      "dataset_stable_id": "mdb-2126-202504170018",
+      "dataset_hash": "7d019543ee12b2a44d580d7780d71546108a2cb1c4f3bfcc5cf3ee97b847fafd",
+      "authentication_type": "0",
+      "authentication_info_url": "",
+      "api_key_parameter_name": ""
+    }
+  }
+}
+
 ```
 
 # Function configuration

diff --git a/functions-python/batch_process_dataset/src/main.py b/functions-python/batch_process_dataset/src/main.py
@@ -18,6 +18,8 @@
 import json
 import logging
 import os
+from pathlib import Path
+
 import random
 import uuid
 import zipfile
@@ -59,6 +61,14 @@ class DatasetFile:
     zipped_size: Optional[int] = None
 
 
+def peek_bytes(path: str, n: int = 64) -> bytes:
+    target = Path(path)  # leading bytes let the caller sniff the content type
+    if target.exists():
+        with target.open("rb") as handle:
+            return handle.read(n)  # at most the first n bytes of the file
+    return b""  # nothing on disk to peek at
+
+
 class DatasetProcessor:
     def __init__(
         self,
@@ -206,10 +216,99 @@ def upload_dataset(self, public=True) -> DatasetFile or None:
             temp_file_path = self.generate_temp_filename()
             file_sha256_hash, is_zip = self.download_content(temp_file_path)
             if not is_zip:
-                self.logger.error(
-                    f"[{self.feed_stable_id}] The downloaded file from {self.producer_url} is not a valid ZIP file."
+                # General guard for HTML/non-ZIP responses and a browser-like fallback download
+                first = peek_bytes(temp_file_path, 64)
+                looks_html = (
+                    first.strip().startswith(b"<!DOCTYPE") or b"<html" in first.lower()
                 )
-                return None
+                if looks_html:
+                    self.logger.warning(
+                        "[%s] Download returned HTML instead of ZIP. "
+                        "Retrying with browser-like headers and session.",
+                        self.feed_stable_id,
+                    )
+                    try:
+                        import requests
+                        from urllib.parse import urlparse
+
+                        parsed = urlparse(self.producer_url)
+                        origin = f"{parsed.scheme}://{parsed.netloc}"
+                        referer = origin + "/"
+                        dir_url = (
+                            (self.producer_url.rsplit("/", 1)[0] + "/")
+                            if "/" in self.producer_url
+                            else referer
+                        )
+                        headers = {
+                            "User-Agent": (
+                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                                "Chrome/124.0 Safari/537.36"
+                            ),
+                            "Accept": "*/*",
+                            "Accept-Language": "en-US,en;q=0.9",
+                            # Use site origin as a safe, general Referer (no path assumptions)
+                            "Referer": referer,
+                            "Connection": "keep-alive",
+                        }
+                        os.makedirs(
+                            os.path.dirname(temp_file_path) or ".", exist_ok=True
+                        )
+                        with requests.Session() as s:
+                            s.headers.update(headers)
+                            # Best-effort cookie priming on the site origin and the file's directory (non-fatal)
+                            try:
+                                s.get(referer, timeout=15, allow_redirects=True)
+                            except Exception:
+                                pass
+                            try:
+                                if dir_url != referer:
+                                    s.get(dir_url, timeout=15, allow_redirects=True)
+                            except Exception:
+                                pass
+                            with s.get(
+                                self.producer_url,
+                                stream=True,
+                                timeout=60,
+                                allow_redirects=True,
+                            ) as r:
+                                r.raise_for_status()
+                                ct = (r.headers.get("Content-Type") or "").lower()
+                                # Peek signature to guard against HTML interstitials
+                                first8 = r.raw.read(8, decode_content=True)
+                                if "text/html" in ct or not first8.startswith(b"PK"):
+                                    raise RuntimeError(
+                                        f"Unexpected response during fallback. "
+                                        f"Content-Type={ct!r}, first bytes={first8!r}"
+                                    )
+                                with open(temp_file_path, "wb") as out:
+                                    out.write(first8)
+                                    for chunk in r.iter_content(chunk_size=1024 * 1024):
+                                        if chunk:
+                                            out.write(chunk)
+                        if zipfile.is_zipfile(temp_file_path):
+                            # Recompute hash after successful fallback
+                            file_sha256_hash, is_zip = (
+                                self.compute_file_hash(temp_file_path),
+                                True,
+                            )
+                            self.logger.info(
+                                "[%s] Fallback download validated as ZIP.",
+                                self.feed_stable_id,
+                            )
+                        else:
+                            self.logger.error(
+                                "[%s] The downloaded file from %s is not a valid ZIP file.",
+                                self.feed_stable_id,
+                                self.producer_url,
+                            )
+                            return None
+                    except Exception as fallback_err:
+                        self.logger.error(
+                            "[%s] Browser-like fallback failed: %s",
+                            self.feed_stable_id,
+                            fallback_err,
+                        )
 
             self.logger.info(
                 f"[{self.feed_stable_id}] File hash is {file_sha256_hash}."

diff --git a/functions-python/batch_process_dataset/src/scripts/HTML_headers_verifier.sh b/functions-python/batch_process_dataset/src/scripts/HTML_headers_verifier.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Download a feed while capturing response headers and using a browser-like client identity.
+# Usage: HTML_headers_verifier.sh [URL] [REFERER]   (defaults: the STM GTFS feed and its site origin)
+# - Valid ZIP should begin with “PK..” (hex 50 4b 03 04 or 50 4b 05 06).
+# - If you see “<!DOCTYPE” or “<html…”, you got an HTML page (redirect, consent, 403, etc.).
+URL="${1:-https://www.stm.info/sites/default/files/gtfs/gtfs_stm.zip}"
+REFERER="${2:-https://www.stm.info/}"
+
+# Remove leftovers from a previous run so a failed download is never mistaken for a fresh one.
+rm -f /tmp/headers.txt /tmp/stm.zip
+
+curl -sSL --fail \
+  -D /tmp/headers.txt \
+  -A "Mozilla/5.0" \
+  -H "Referer: ${REFERER}" \
+  -H "Accept: */*" \
+  -o /tmp/stm.zip \
+  "${URL}" || echo "curl failed with exit code $?" >&2
+
+echo "== Headers ==" && cat /tmp/headers.txt
+echo "== Size ==" && wc -c /tmp/stm.zip
+echo "== First 64 bytes ==" && hexdump -C /tmp/stm.zip | head -n 4
diff --git a/functions-python/batch_process_dataset/src/scripts/headers_verifier.sh b/functions-python/batch_process_dataset/src/scripts/headers_verifier.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Inspect what the server actually returned: save the response headers to see the final status and Content-Type.
+# If Content-Type is text/html or the status is 30x/403, you’re not getting the ZIP.
+# Usage: headers_verifier.sh [URL]   (defaults to the STM GTFS feed)
+URL="${1:-https://www.stm.info/sites/default/files/gtfs/gtfs_stm.zip}"
+
+# Remove leftovers from a previous run so a failed download is never mistaken for a fresh one.
+rm -f /tmp/headers.txt /tmp/stm.zip
+
+curl -sSL --fail \
+  -D /tmp/headers.txt \
+  -A "Mozilla/5.0" \
+  -o /tmp/stm.zip \
+  "${URL}" || echo "curl failed with exit code $?" >&2
+
+echo "== Headers ==" && cat /tmp/headers.txt
+file /tmp/stm.zip
+hexdump -C /tmp/stm.zip | head
diff --git a/functions-python/batch_process_dataset/src/scripts/zip_verifier.sh b/functions-python/batch_process_dataset/src/scripts/zip_verifier.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Usage: zip_verifier.sh [URL]   (defaults to the STM GTFS feed)
+URL="${1:-https://www.stm.info/sites/default/files/gtfs/gtfs_stm.zip}"
+
+# Inspect headers and redirect chain (no file saved)
+curl -ILv "${URL}"
+
+# Remove leftovers from a previous run so the sanity checks only ever see this download.
+rm -f /tmp/stm.zip
+
+# Download the body (ZIP) — no -I, follow redirects, set UA
+curl -L --fail -A "Mozilla/5.0" \
+  -o /tmp/stm.zip \
+  "${URL}" || echo "curl failed with exit code $?" >&2
+
+# Sanity checks
+file /tmp/stm.zip
+python3 - <<'PY'
+import zipfile
+print("is_zip:", zipfile.is_zipfile("/tmp/stm.zip"))
+PY