diff --git a/allofplos/article.py b/allofplos/article.py
index 49ab4571..9a54ac29 100644
--- a/allofplos/article.py
+++ b/allofplos/article.py
@@ -1214,11 +1214,8 @@ def body(self):
         :rtype: {str}
         """
-        xml_tree = et.parse(self.filename)
-        root = xml_tree.getroot()
-
-        # limit the text to the body section
-        body = root.find('./body')
+        body = self.root.find('./body')
 
         # remove supplementary material section
         for sec in body.findall('.//sec'):
@@ -1378,3 +1375,70 @@ def from_filename(cls, filename):
         else:
             directory = None
         return cls(filename_to_doi(filename), directory=directory)
+
+    # region: review_crawling2022
+    @classmethod
+    def from_xml(cls, source, directory=None):
+        """Initialize an article object from an XML-encoded string.
+        Parses the XML to obtain the article's DOI.
+
+        :param source: string containing XML describing an article
+        :param directory: path to directory containing the XML for this article. Defaults to `get_corpus_dir()` via `Article.__init__`.
+        """
+        root = et.fromstring(source)
+        doi = root.find("front//article-id[@pub-id-type='doi']").text.strip()
+        article = cls(doi, directory)
+        article.tree = root.getroottree()
+        return article
+
+    @tree.setter
+    def tree(self, value):
+        """
+        Set the lxml tree for this article to the given object.
+        """
+        assert isinstance(value, et._ElementTree)  # TODO: better validation?
+        self._tree = value
+
+    def get_subarticles(self):
+        """Get sub-articles embedded in the XML tree of this article.
+
+        :rtype: list
+        :return: list of lxml elements that are roots of each sub-article
+        """
+        sub_articles = self.root.findall('sub-article')
+        return sub_articles  # maybe return a list of Articles instead?
+
+    def get_author_names(self):
+        """
+        Compress the list of dicts stored in `self.authors` into a simpler list of author names.
+
+        :rtype: list
+        """
+        parsed_authors = []
+        for author in self.authors:
+            if author['given_names'] is None and author['surname'] is None:
+                parsed_authors.append(author['group_name'])
+            else:
+                parsed_authors.append(author['given_names'] + ' ' + author['surname'])
+        return parsed_authors
+
+    @property
+    def categories(self):
+        """
+        Get the categories (or keywords) defined for this article.
+
+        :rtype: list
+        """
+        keywords_set = set()  # using a set because keywords tend to be duplicated
+        categories = self.root.find('.//front').find('.//article-categories')
+        if categories is None:
+            return None
+
+        for el in categories[1:]:  # skip the first element, which is a "heading"
+            for subj in el.iterdescendants():
+                if len(subj) == 1:
+                    keywords_set.add(subj[0].text.strip())
+        return list(keywords_set)
+
+
+    # endregion
\ No newline at end of file
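A minimal usage sketch of the new `from_xml` constructor and the accessors added above; the local file name is hypothetical, and `directory` is left at its default (`get_corpus_dir()`):

```python
from allofplos.article import Article

# Build an Article from raw JATS XML bytes (hypothetical local file).
with open('journal.pone.0185809.xml', 'rb') as f:
    article = Article.from_xml(f.read())

print(article.doi)                     # DOI parsed from <article-id pub-id-type='doi'>
print(article.get_author_names())      # flat list of author and group names
print(article.categories)              # de-duplicated keyword list, or None
print(len(article.get_subarticles()))  # e.g. embedded peer-review sub-articles
```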
diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py
index c651fdd0..c6802321 100644
--- a/allofplos/corpus/plos_corpus.py
+++ b/allofplos/corpus/plos_corpus.py
@@ -94,14 +94,16 @@ def download_corpus_zip():
     return file_path
 
 
-def unzip_articles(file_path):
+def unzip_articles(file_path, extract_directory=None, delete_file=True):
     """
     Unzips zip file of all of PLOS article XML to specified directory
 
     :param file_path: path to file to be extracted
+    :param extract_directory: directory that articles are extracted to; defaults to `get_corpus_dir()`
+    :param delete_file: whether to delete the compressed archive after extracting articles
     :return: None
     """
-    extract_directory = get_corpus_dir()
-
+    if extract_directory is None:
+        extract_directory = get_corpus_dir()
     os.makedirs(extract_directory, exist_ok=True)
 
     with zipfile.ZipFile(file_path, "r") as zip_ref:
@@ -110,7 +110,8 @@
             zip_ref.extract(article, path=extract_directory)
 
     tqdm.write("Extraction complete.")
-    os.remove(file_path)
+    if delete_file:
+        os.remove(file_path)
 
 
 def listdir_nohidden(path, extension='.xml', include_dir=True):
@@ -633,7 +634,7 @@ def download_check_and_move(article_list, proof_filepath, tempdir, destination):
     move_articles(tempdir, destination)
 
 
-def create_local_plos_corpus(directory=None, rm_metadata=True):
+def create_local_plos_corpus(directory=None, rm_metadata=True, unzip=True, delete_file=True):
     """
     Downloads a fresh copy of the PLOS corpus by:
     1) creating directory if it doesn't exist
@@ -642,6 +643,8 @@
     3) extracting the individual XML files into the corpus directory
     :param directory: directory where the corpus is to be downloaded and extracted
     :param rm_metadata: COMPLETE HERE
+    :param unzip: whether to extract article files into the corpus directory, or just keep the zip file. Defaults to `True`
+    :param delete_file: whether to delete the compressed archive after extracting articles. Defaults to `True`
    :return: None
     """
     if directory is None:
@@ -650,4 +653,6 @@
         print('Creating folder for article xml')
         os.makedirs(directory, exist_ok=True)
     zip_path = download_corpus_zip()
-    unzip_articles(file_path=zip_path)
+    if unzip:
+        unzip_articles(file_path=zip_path, extract_directory=directory, delete_file=delete_file)
+
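A sketch of how the new parameters compose; paths are hypothetical. With `unzip=False`, `create_local_plos_corpus` now just downloads the archive and leaves extraction for later:

```python
from allofplos.corpus.plos_corpus import unzip_articles

# Extract into a non-default directory and keep the archive for re-use.
unzip_articles('allofplos_xml.zip',
               extract_directory='my_corpus_dir',
               delete_file=False)
```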
diff --git a/allofplos/plos_regex.py b/allofplos/plos_regex.py
index df6cd32c..3bbbc391 100644
--- a/allofplos/plos_regex.py
+++ b/allofplos/plos_regex.py
@@ -8,8 +8,9 @@
 from . import get_corpus_dir
 
 regex_match_prefix = r"^10\.1371/"
-regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7}$)"
+regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
                     r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))")
+regex_suffix_match = r"(\.[rs][0-9]{3})?"  # matches reviews and supplementary materials
 regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
                      r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
 regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)"
@@ -18,17 +19,20 @@
                        r"|([a-zA-Z0-9]{32}$))")
 regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})"
                      r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))")
-full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match)
+full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match+regex_suffix_match+r"$")
 full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}"
                                    "|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}")
 currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents)
 file_regex_match = re.compile(regex_file_search+r"\.xml")
-BASE_URL = 'https://journals.plos.org/plosone/article/file?id='
-URL_SUFFIX = '&type=manuscript'
-external_url_regex_match = re.compile(re.escape(BASE_URL) +
-                                      re.escape("10.1371/") +
-                                      regex_body_search +
-                                      re.escape(URL_SUFFIX))
+regex_type_match = r"((article)|(peerReview))"
+regex_file_suffix = r"&type=((manuscript)|(supplementary))"
+
+BASE_URL = 'https://journals.plos.org/plosone/'
+external_url_regex_match = re.compile(re.escape(BASE_URL) + re.escape("article/file?id=10.1371/") +
+                                      regex_body_search + regex_suffix_match + regex_file_suffix)
+plos_url_regex_match = re.compile(re.escape("https://journals.plos.org/") + r"[a-z]+/" +
+                                  regex_type_match + re.escape("?id=10.1371/") +
+                                  regex_body_search + regex_suffix_match)
 
 
 def validate_doi(doi):
@@ -56,14 +60,27 @@ def validate_filename(filename):
         return False
 
 
-def validate_url(url):
+def validate_file_url(url):
     """
-    For an individual string, tests whether the full string is in a valid article url format or not
+    For an individual string, tests whether the full string is in a valid article (manuscript) url format or not
     Example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147&type=manuscript'
     is True, but 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147' is False
-    :return: True if string is in a valid PLOS article url; False if not
+
+    URLs leading to files containing supplementary material are also valid.
+    Example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0222522.s002&type=supplementary' is True
+
+    :return: True if string is in a valid PLOS file url; False if not
+    """
+    return bool(external_url_regex_match.match(url))
+
+
+def validate_plos_url(url):
+    """
+    Tests whether the given `url` string is a valid PLOS website URL.
+
+    :return: True if string is in a valid PLOS url; False otherwise
     """
-    return bool(external_url_regex_match.search(url))
+    return bool(plos_url_regex_match.search(url))
 
 
 def find_valid_dois(doi):
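A quick sanity check of the intended behavior; the assertions reflect what the revised patterns are expected to accept:

```python
from allofplos.plos_regex import validate_doi, validate_file_url, validate_plos_url

# .r### (review) and .s### (supplementary) suffixes now validate:
assert validate_doi('10.1371/journal.pone.0222522')
assert validate_doi('10.1371/journal.pone.0222522.s002')

assert validate_file_url(
    'https://journals.plos.org/plosone/article/file'
    '?id=10.1371/journal.pone.0222522.s002&type=supplementary')

assert validate_plos_url(
    'https://journals.plos.org/plosone/peerReview'
    '?id=10.1371/journal.pone.0222522')
```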
diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py
index c28f9b9e..0a3855f0 100644
--- a/allofplos/samples/corpus_analysis.py
+++ b/allofplos/samples/corpus_analysis.py
@@ -17,7 +17,7 @@
 from .. import get_corpus_dir, newarticledir
-from ..plos_regex import (validate_doi, full_doi_regex_match, validate_url, validate_filename)
+from ..plos_regex import (validate_doi, full_doi_regex_match, validate_file_url, validate_filename)
 from ..transformations import (filename_to_doi, doi_to_url)
 from ..corpus.plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
                                   download_updated_xml, get_all_solr_dois,
@@ -49,7 +49,7 @@ def validate_corpus(directory=None):
     # check urls
     plos_urls = [doi_to_url(doi) for doi in plos_valid_dois]
-    plos_valid_urls = [url for url in plos_urls if validate_url(url)]
+    plos_valid_urls = [url for url in plos_urls if validate_file_url(url)]
     if set(plos_urls) == set(plos_valid_urls) and len(plos_valid_urls) == len(plos_valid_dois):
         pass
     else:
diff --git a/allofplos/transformations.py b/allofplos/transformations.py
index 0a98b2f9..ae146734 100644
--- a/allofplos/transformations.py
+++ b/allofplos/transformations.py
@@ -36,7 +36,8 @@
                'assetFile': 'article/file',
                'assetXMLFile': 'article/file',
                'articleMetrics': 'article/metrics',
-               'articleRelated': 'article/related'}
+               'articleRelated': 'article/related',
+               'peerReview': 'article/peerReview'}
 
 
 def _get_base_page(journal):
@@ -144,8 +145,8 @@ def url_to_doi(url):
     Example: url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \
     '10.1371/journal.pone.1000001'
-    :param url: online location of a PLOS article's XML
-    :return: full unique identifier for a PLOS article
+    :param url: online location of a PLOS article's XML (a base article link works too)
+    :return: full unique identifier for a PLOS article (or for a peer review, supplementary material, etc.)
     """
     return url[url.index(PREFIX):].rstrip(URL_SUFFIX).rstrip(INT_URL_SUFFIX)
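With the `peerReview` page type registered, peer-review links should reduce to the same DOI as base article links; a sketch of the intended round trip (expected output in the comment):

```python
from allofplos.transformations import url_to_doi

print(url_to_doi('https://journals.plos.org/plosone/peerReview'
                 '?id=10.1371/journal.pone.0222522'))
# 10.1371/journal.pone.0222522
```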
diff --git a/tests/test_unittests.py b/tests/test_unittests.py
index 9ecff9ad..ac601fa3 100644
--- a/tests/test_unittests.py
+++ b/tests/test_unittests.py
@@ -92,7 +92,7 @@ def test_class_doi1(self):
         self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.editor, [{'contrib_initials': 'EGL', 'given_names': 'Eric Gordon', 'surname': 'Lamb', 'group_name': None, 'ids': [], 'rid_dict': {'aff': ['edit1']}, 'contrib_type': 'editor', 'author_type': None, 'editor_type': None, 'email': None, 'affiliations': ['University of Saskatchewan, CANADA'], 'author_roles': {None: ['Editor']}, 'footnotes': []}], 'editor does not transform correctly for {}'.format(article.doi))
         article_relpath = os.path.relpath(article.filepath, TESTDIR)
-        self.assertEqual(article_relpath, "testdata/journal.pone.0185809.xml", 'filename does not transform correctly for {}'.format(article.doi))
+        self.assertEqual(article_relpath, os.path.join("testdata", "journal.pone.0185809.xml"), 'filename does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0185809", 'page does not transform correctly for {}'.format(article.doi))
@@ -133,7 +133,7 @@ def test_example_doi(self):
         self.assertEqual(article.dtd, "JATS 1.1d3", 'dtd does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi))
         article_relpath = os.path.relpath(article.filepath, TESTDIR)
-        self.assertEqual(article_relpath, "testdata/journal.pbio.2001413.xml", 'filename does not transform correctly for {}'.format(article.doi))
+        self.assertEqual(article_relpath, os.path.join("testdata", "journal.pbio.2001413.xml"), 'filename does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.journal, "PLOS Biology", 'journal does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.page, "https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001413", 'page does not transform correctly for {}'.format(article.doi))
@@ -172,7 +172,7 @@ def test_example_doi2(self):
         self.assertEqual(article.dtd, "NLM 3.0", 'dtd does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.editor, [], 'editor does not transform correctly for {}'.format(article.doi))
         article_relpath = os.path.relpath(article.filepath, TESTDIR)
-        self.assertEqual(article_relpath, "testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml", 'filename does not transform correctly for {}'.format(article.doi))
+        self.assertEqual(article_relpath, os.path.join("testdata", "plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml"), 'filename does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.journal, "PLOS ONE", 'journal does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.local, True, 'local does not transform correctly for {}'.format(article.doi))
         self.assertEqual(article.page, "https://journals.plos.org/plosone/article?id=10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6", 'page does not transform correctly for {}'.format(article.doi))
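The switch to `os.path.join` makes the expected values platform-independent, since `os.path.relpath` returns paths with the native separator:

```python
import os

# POSIX:   'testdata/journal.pone.0185809.xml'
# Windows: 'testdata\\journal.pone.0185809.xml'
print(os.path.join("testdata", "journal.pone.0185809.xml"))
```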