Skip to content

Commit 2af3059

Browse files
committed
✨ Be more flexible about recordIdentifiers
1 parent 0855ccb commit 2af3059

File tree

1 file changed

+31
-20
lines changed

1 file changed

+31
-20
lines changed

src/mods4pandas/mods4pandas.py

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -169,21 +169,25 @@ def no_uuid(record_identifier):
169169
# By default we assume source="gbv-ppn" mods:recordIdentifiers (= PPNs),
170170
# however, in mods:relatedItems, there may be source="dnb-ppn" identifiers,
171171
# which we need to distinguish by using a separate field name.
172-
try:
173-
value["recordIdentifier"] = (
174-
TagGroup(tag, group)
175-
.filter(no_uuid)
176-
.is_singleton()
177-
.has_attributes({"source": "gbv-ppn"})
178-
.text()
179-
)
180-
except ValueError:
181-
value["recordIdentifier-dnb-ppn"] = (
182-
TagGroup(tag, group)
183-
.is_singleton()
184-
.has_attributes({"source": "dnb-ppn"})
185-
.text()
186-
)
172+
173+
for field_name, source in \
174+
("recordIdentifier", "gbv-ppn"), \
175+
("recordIdentifier-dnb-ppn", "dnb-ppn"), \
176+
("recordIdentifier-zdb", "zdb"):
177+
try:
178+
value[field_name] = (
179+
TagGroup(tag, group)
180+
.filter(no_uuid)
181+
.fix_recordIdentifier_source_zdb()
182+
.is_singleton()
183+
.has_attributes({"source": source})
184+
.text()
185+
)
186+
break
187+
except ValueError as e:
188+
pass
189+
if field_name not in value:
190+
raise ValueError("Unknown recordIdentifier found")
187191
elif tag == "{http://www.loc.gov/mods/v3}identifier":
188192
for e in group:
189193
if len(e.attrib) != 1:
@@ -634,11 +638,18 @@ def process(mets_files: list[str], output_file: str, output_page_info: str, mets
634638
logger.exception("Exception in {}".format(mets_file))
635639

636640
logger.info("Writing DataFrame to {}".format(output_file))
637-
try:
638-
convert_db_to_parquet(con, "mods_info", "recordInfo_recordIdentifier", output_file)
639-
except:
640-
# FIXME: Fix missing mods:recordInfo instead, https://github.com/qurator-spk/mods4pandas/issues/60
641-
convert_db_to_parquet(con, "mods_info", "recordIdentifier", output_file)
641+
considered_indexes = ("recordInfo_recordIdentifier", "recordIdentifier-zdb")
642+
success = False
643+
for considered_index in considered_indexes:
644+
try:
645+
convert_db_to_parquet(con, "mods_info", considered_index, output_file)
646+
success = True
647+
break
648+
except:
649+
pass
650+
if not success:
651+
raise ValueError(f"None of {considered_indexes} found")
652+
642653
if output_page_info:
643654
logger.info("Writing DataFrame to {}".format(output_page_info))
644655
convert_db_to_parquet(

0 commit comments

Comments
 (0)