✨ Be more flexible about recordIdentifiers

mikegerber · mikegerber · commit 2af30598bdf5 · 2025-08-08T12:06:35.000+02:00
diff --git a/src/mods4pandas/mods4pandas.py b/src/mods4pandas/mods4pandas.py
@@ -169,21 +169,25 @@ def no_uuid(record_identifier):
             # By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
             # however, in mods:relatedItems, there may be source="dnb-ppns",
             # which we need to distinguish by using a separate field name.
-            try:
-                value["recordIdentifier"] = (
-                    TagGroup(tag, group)
-                    .filter(no_uuid)
-                    .is_singleton()
-                    .has_attributes({"source": "gbv-ppn"})
-                    .text()
-                )
-            except ValueError:
-                value["recordIdentifier-dnb-ppn"] = (
-                    TagGroup(tag, group)
-                    .is_singleton()
-                    .has_attributes({"source": "dnb-ppn"})
-                    .text()
-                )
+
+            for field_name, source in \
+                ("recordIdentifier",         "gbv-ppn"), \
+                ("recordIdentifier-dnb-ppn", "dnb-ppn"), \
+                ("recordIdentifier-zdb",     "zdb"):
+                try:
+                    value[field_name] = (
+                        TagGroup(tag, group)
+                        .filter(no_uuid)
+                        .fix_recordIdentifier_source_zdb()
+                        .is_singleton()
+                        .has_attributes({"source": source})
+                        .text()
+                    )
+                    break
+                except ValueError as e:
+                    pass
+            if field_name not in value:
+                raise ValueError("Unknown recordIdentifier found")
         elif tag == "{http://www.loc.gov/mods/v3}identifier":
             for e in group:
                 if len(e.attrib) != 1:
@@ -634,11 +638,18 @@ def process(mets_files: list[str], output_file: str, output_page_info: str, mets
                 logger.exception("Exception in {}".format(mets_file))
 
     logger.info("Writing DataFrame to {}".format(output_file))
-    try:
-        convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
-    except:
-        # FIXME: Fix missing mods:recordInfo instead, https://github.com/qurator-spk/mods4pandas/issues/60
-        convert_db_to_parquet(con, "mods_info", "recordIdentifier", output_file)
+    considered_indexes = ("recordInfo_recordIdentifier", "recordIdentifier-zdb")
+    success = False
+    for considered_index in considered_indexes:
+        try:
+            convert_db_to_parquet(con, "mods_info", considered_index, output_file)
+            success = True
+            break
+        except:
+            pass
+    if not success:
+        raise ValueError(f"None of {considered_indexes} found")
+
     if output_page_info:
         logger.info("Writing DataFrame to {}".format(output_page_info))
         convert_db_to_parquet(