diff --git a/Parsers/parse_litcovid_for_vespa.py b/Parsers/parse_litcovid_for_vespa.py new file mode 100644 index 0000000..3cba916 --- /dev/null +++ b/Parsers/parse_litcovid_for_vespa.py @@ -0,0 +1,403 @@ +import pymongo +import os +import datetime +from pprint import pprint +from collections import defaultdict +import datetime +import requests +import csv +import difflib +import time +import xml.etree.ElementTree as ET +import json +import xmltodict +import calendar + +#client = pymongo.MongoClient(os.getenv("COVID_HOST"), username=os.getenv("COVID_USER"), +# password=os.getenv("COVID_PASS"), authSource=os.getenv("COVID_DB")) + +#db = client[os.getenv("COVID_DB")] + + +def get_LitCovid_Data(): # Collect most recent data file directly from LitCovid website + titles_pmids_journals = [] + session = requests.Session() + LitCovidData_url = 'https://www.ncbi.nlm.nih.gov/research/coronavirus-api/export/tsv?' + response = session.get(LitCovidData_url) + data = response.content.decode('utf-8') + data = data.split('\n') + data = data[32:] + for row in data: + if row != '': + row = row.split('\t') + article = dict() + article['pmid'] = row[0] + article['title'] = row[1] + article['journal'] = row[2] + titles_pmids_journals.append(article) + return titles_pmids_journals + +def pmid2doi(pmid): # Get doi with given pmid (returns None if id cannot be converted) + session = requests.Session() + pmid2doi_url = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=%s' % pmid + response = session.get(pmid2doi_url) + root = ET.fromstring(response.content) + for record in root.iter('record'): + if 'doi' in root.find('record').attrib: + doi = root.find('record').attrib['doi'] + return doi + else: + return + +def clean_title(title): # Clean input title for title2doi conversion + if title.startswith('['): + title = title[1:-1] + if title.endswith('.'): + title = title[0:-1] + return title + +def title2doi(title): # Collect doi for article based on title if doi is not available + session = requests.Session() + title = clean_title(title) + crossref_url = 'https://api.crossref.org/works?query=%s' % title.replace(' ', '%20') + try: + response = session.get(crossref_url) + except: + print('request to cross_ref failed!') + return + try: + response = response.json() + cr_response = response['message'] + items = cr_response['items'] + + # Title + top_title = items[0]['title'][0] + if top_title.endswith(' (Preprint)'): + top_title = top_title.replace(' (Preprint)', '') + if u"\u2013" in top_title: + top_title = top_title.replace(u"\u2013", '-') + if u"\u2019" in top_title: + top_title = top_title.replace(u"\u2019", "'") + + # check if top query matches the given title... if it does, return DOI, if it doesn't, return None + if top_title == title: + return items[0]['DOI'] + else: + return + except: + print("Couldn't find title on crossref") + +def crossref_get(doi): # Scrape raw metadata (json format) from crossref if EFetch fails + session = requests.Session() + crossref_url = 'https://api.crossref.org/works/%s' % doi + try: + response = session.get(crossref_url) + except: + print('request to cross_ref failed!') + return + try: + response = response.json() + cr_response = response['message'] + return cr_response + except: + return + +def pubmed_get(pmid): # Scrape raw metadata (xml format) from pubmed with EFetch + session = requests.Session() + pubmed_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=%s&retmode=xml' % pmid + try: + response = session.get(pubmed_url) + except: + print('Request to PubMed failed!') + return + try: + root = ET.fromstring(response.content) + article = root.find('PubmedArticle').find('MedlineCitation').find('Article') + return response + except: + return + +def crossref_parse(response, pmid, journal): # Parse raw metadata from crossref result for Vespa + + cr_metadata = dict() + + #title + cr_metadata['title'] = response['title'] + + #source + cr_metadata['source'] = 'PubMed' + + #license + cr_metadata['license'] = None + + #datestring + formatted_date = "" + date = response['issued']['date-parts'] + if len(date[0]) == 1 and date[0] != None: + formatted_date = "{0}".format(date[0][0]) + cr_metadata['datestring'] = datetime.datetime.strptime(formatted_date, "%Y") + elif len(date[0]) == 2: + formatted_date = "{0}-{1}".format(date[0][0], date[0][1]) + cr_metadata['datestring'] = datetime.datetime.strptime(formatted_date, "%Y-%m") + else: + formatted_date = "{0}-{1}-{2}".format(date[0][0], date[0][1], date[0][2]) + cr_metadata['datestring'] = datetime.datetime.strptime(formatted_date, "%Y-%m-%d") + + #doi, pmcid, and pubmed_id + cr_metadata['doi'] = response['DOI'] + cr_metadata['pmcid'] = None + cr_metadata['pubmed_id'] = pmid + + #url + if 'link' in response: + cr_metadata['link'] = response['link'] + else: + cr_metadata['link'] = 'https://doi.org/' + response['DOI'] + + #cord_uid + cr_metadata['cord_uid'] = None + + #authors + if 'author' in response: + authors = [] + for author in response['author']: + if 'given' in author: + name = '{0} {1}'.format(author['given'], author['family']) + first = author['given'] + last = author['family'] + else: + name = '{0}'.format(author['family']) + last = author['family'] + affiliation = author['affiliation'] + authors.append({'name' : name}) + + cr_metadata['authors'] = authors + + #bib_entries + if 'reference' in response: + bibs = [] + references = response['reference'] + i = 0 + for citation in references: + bib = dict() + bib['ref_id'] = 'b{0}'.format(i) + if 'article-title' in citation.keys(): + bib['title'] = citation['article-title'] + if 'year' in citation.keys(): + bib['year'] = citation['year'] + if 'ISSN' in citation.keys(): + bib['issn'] = citation['ISSN'] + + #abstract + if 'abstract' in response: + cr_metadata['abstract'] = response['abstract'] + + #journal + if journal != None: + cr_metadata['journal'] = journal + elif 'container-title' in response.keys(): + cr_metadata['journal'] = response['container-title'] + elif 'short-container-title' in response.keys(): + cr_metadata['journal'] = response['short-container-title'] + + #body_text + cr_metadata['body_text'] = None + + #conclusion + cr_metadata['conclusion'] = None + + #introduction + cr_metadata['introduction'] = None + + #results + cr_metadata['results'] = None + + #discussion + cr_metadata['discussion'] = None + + #methods + cr_metadata['methods'] = None + + #background + cr_metadata['background'] = None + + #timestamp + try: + timestamp = int(datetime.datetime.strptime(formatted_date, '%Y-%m-%d').timestamp()) + except: + pass + + #who_covidence + cr_metadata['who_covidence'] = None + + #has_full_text + cr_metadata['has_full_text'] = None + + #dataset_version + cr_metadata['dataset_version'] = datetime.datetime.now().timestamp() + + return cr_metadata + +def pubmed_parse(response, pmid, journal): # Parse raw metadata from PubMed EFetch result for Vespa + + xml = ET.fromstring(response.content) + article = xml.find('PubmedArticle').find('MedlineCitation').find('Article') + + pubmed_metadata = dict() + + #title + pubmed_metadata['title'] = article.find('ArticleTitle').text + + #source + pubmed_metadata['source'] = 'PubMed' + + #license + pubmed_metadata['license'] = None + + #datestring + ArticleDate = article.find('ArticleDate') + if ArticleDate: + if ArticleDate.find('Day') != None: + day = ArticleDate.find('Day').text + month = ArticleDate.find('Month').text + if len(month) == 3: + month = list(calendar.month_abbr).index(month) + year = ArticleDate.find('Year').text + formatted_date = "{0}-{1}-{2}".format(year, month, day) + pubmed_metadata['datestring'] = datetime.datetime.strptime(formatted_date, "%Y-%m-%d") + elif ArticleDate.find('Month') != None: + month = ArticleDate.find('Month').text + if len(month) == 3: + month = list(calendar.month_abbr).index(month) + year = ArticleDate.find('Year').text + formatted_date = "{0}, {1}".format(year, month) + pubmed_metadata['datestring'] = datetime.datetime.strptime(formatted_date, "%Y-%m") + elif ArticleDate.find('Year') != None: + year = ArticleDate.find('Year').text + formatted_date = "{0}".format(year) + pubmed_metadata['datestring'] = datetime.datetime.strptime(formatted_date, "%Y") + + #doi, pmcid, and pubmed_id + IDs = xml.find('PubmedArticle').find('PubmedData').find('ArticleIdList') + if IDs: + for id in IDs.iter('ArticleId'): + if id.attrib['IdType'] == 'doi': + pubmed_metadata['doi'] = id.text + if id.attrib['IdType'] == 'pmcid': + pubmed_metadata['pmcid'] = id.text + pubmed_metadata['pubmed_id'] = pmid + + #url + if 'doi' in pubmed_metadata.keys(): + pubmed_metadata['link'] = 'https://doi.org/' + pubmed_metadata['doi'] + + #authors + authors = article.find('AuthorList') + if authors: + if authors.attrib['CompleteYN'] == 'Y': + pubmed_metadata['authors'] = [] + for author in authors.iter('Author'): + if author.attrib['ValidYN'] == 'Y': + author_info = dict() + if author.find('ForeName') != None and author.find('LastName') != None: + first = author.find('ForeName').text + last = author.find('LastName').text + name = first + last + author_info['Name'] = u'{0} {1}'.format(first, last) + pubmed_metadata['authors'].append(author_info) + + #bib_entries + pubmed_metadata['bib_entries'] = None + + #abstract + if article.find('Abstract') != None: + abstract_sections = article.find('Abstract') + abstract = "" + for section in abstract_sections.iter('AbstractText'): + if 'Label' in section.attrib: + heading = section.attrib['Label'] + ': ' + abstract = abstract + heading + section.text + ' ' + elif section.text != None: + abstract = abstract + section.text + pubmed_metadata['abstract'] = abstract + + #journal + if journal != None: + pubmed_metadata['journal'] = journal + else: + pubmed_metadata['journal'] = article.find('Journal').find('Title').text + + #body_text + pubmed_metadata['body_text'] = None + + #conclusion + pubmed_metadata['conclusion'] = None + + #introduction + pubmed_metadata['introduction'] = None + + #results + pubmed_metadata['results'] = None + + #discussion + pubmed_metadata['discussion'] = None + + #methods + pubmed_metadata['methods'] = None + + #background + pubmed_metadata['background'] = None + + #Pubmed id + pubmed_metadata['pubmed_id'] = pmid + + #timestamp + try: + timestamp = int(datetime.datetime.strptime(formatted_date, '%Y-%m-%d').timestamp()) + except: + pass + + #who_covidence + pubmed_metadata['who_covidence'] = None + + #has_full_text + pubmed_metadata['has_full_text'] = None + + #dataset_version + pubmed_metadata['dataset_version'] = datetime.datetime.now().timestamp() + + return pubmed_metadata + + +def vespa_litcovid_scrape_and_parse(): + titles_pmids_journals = get_LitCovid_Data() # get snapshot of LitCovid data + for article in titles_pmids_journals: + # need to add extra condition to check if article already exists in database + # once LitCovid articles are entered + + # scraping and parsing below assumes there are 4 target MongoDB collections: + # LitCovid_pubmed_xml + # LitCovid_crossref + # Vespa_LitCovid_pubmed_parsed + # Vespa_LitCovid_crossref_parsed + pmid = article['pmid'] + pubmed_metadata_xml = pubmed_get(pmid) # get xml metadata from pubmed EFetch + vespa_pubmed_parsed_data = pubmed_parse(pubmed_metadata_xml, pmid, article['journal']) + # probably best to add both raw and parsed data to MongoDB collection here... something like: + # db.LitCovid_pubmed_xml.insert_one(pubmed_metadata_xml) + # db.Vespa_LitCovid_pubmed_parsed.insert_one(vespa_pubmed_parsed_data) + if pmid2doi(pmid) != None: + doi = pmid2doi(pmid) + crossref_metadata_json = crossref_get(doi) # get json metadata from crossref + vespa_crossref_parsed_data = crossref_parse(crossref_metadata_json, pmid, article['journal']) + # probably best to add both raw and parsed data to MongoDB collection here...something like: + # db.LitCovid_crossref.insert_one(crossref_metadata_json) + # db.Vespa_LitCovid_crossref_parsed.insert_one(vespa_crossref_parsed_data) + elif title2doi(article['title']) != None: + doi = title2doi(article['title']) + crossref_metadata_json = crossref_get(doi) # get json metadata from crossref + vespa_crossref_parsed_data = crossref_parse(crossref_metadata_json, pmid, article['journal']) + # probably best to add both raw and parsed data to MongoDB collection here...something like: + # db.LitCovid_crossref.insert_one(crossref_metadata_json) + # db.Vespa_LitCovid_crossref_parsed.insert_one(vespa_crossref_parsed_data) + return