diff --git a/allofplos/__init__.py b/allofplos/__init__.py index a164f05a..6c9b2bf9 100644 --- a/allofplos/__init__.py +++ b/allofplos/__init__.py @@ -30,5 +30,5 @@ def get_corpus_dir(): # import after creating global variables that they may rely upon # (e.g., corpusdir) -from .article_class import Article +from .article import Article from .corpus import Corpus diff --git a/allofplos/article_class.py b/allofplos/article.py similarity index 98% rename from allofplos/article_class.py rename to allofplos/article.py index e116c4a7..bad63c3b 100644 --- a/allofplos/article_class.py +++ b/allofplos/article.py @@ -9,13 +9,14 @@ from . import get_corpus_dir from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX, - URL_SUFFIX, plos_page_dict, doi_url) + URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path) from .plos_regex import validate_doi from .elements import (parse_article_date, get_contrib_info, Journal, License, match_contribs_to_dicts) +from .utils import dedent -class Article(): +class Article: """The primary object of a PLOS article, initialized by a valid PLOS DOI. """ @@ -46,6 +47,62 @@ def __eq__(self, other): dir_eq = self.directory == other.directory return doi_eq and dir_eq + def __str__(self, exclude_refs=True): + """Output when you print an article object on the command line. + + For parsing and viewing the XML of a local article. Should not be used for hashing + Excludes <back> element (including references list) for easier viewing + :param exclude_refs: remove references from the article tree (eases print viewing) + """ + parser = et.XMLParser(remove_blank_text=True) + tree = et.parse(self.filename, parser) + if exclude_refs: + root = tree.getroot() + back = tree.xpath('./back') + if back: + root.remove(back[0]) + local_xml = et.tostring(tree, + method='xml', + encoding='unicode', + pretty_print=True) + return local_xml + + def __repr__(self): + """Value of an article object when you call it directly on the command line. 
+ + Shows the DOI and title of the article + :returns: DOI and title + :rtype: {str} + """ + out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title) + return out + + + def _repr_html_(self): + """Nice display for Jupyter notebook""" + + titlestyle = 'display:inline-flex;' + titletextstyle = 'margin-left:.5em;' + titlelink = ('' + '{title}').format( + url=self.page, + title=self.title, + titlestyle=titlestyle+titletextstyle, + ) + + doilink = '{doi}'.format( + url=self.doi_link(), + doi=self.doi, + ) + out = dedent("""
+ Title: {titlelink}
+ DOI: {doilink} +
+ """).format(doilink=doilink, titlelink=titlelink, titlestyle=titlestyle) + + return out + + def reset_memoized_attrs(self): """Reset attributes to None when instantiating a new article object. @@ -111,34 +168,6 @@ def doi(self, d): self.reset_memoized_attrs() self._doi = d - def __str__(self, exclude_refs=True): - """Output when you print an article object on the command line. - - For parsing and viewing the XML of a local article. Should not be used for hashing - Excludes <back> element (including references list) for easier viewing - :param exclude_refs: remove references from the article tree (eases print viewing) - """ - parser = et.XMLParser(remove_blank_text=True) - tree = et.parse(self.filename, parser) - if exclude_refs: - root = tree.getroot() - back = tree.xpath('./back') - root.remove(back[0]) - local_xml = et.tostring(tree, - method='xml', - encoding='unicode', - pretty_print=True) - return local_xml - - def __repr__(self): - """Value of an article object when you call it directly on the command line. 
- - Shows the DOI and title of the article - :returns: DOI and title - :rtype: {str} - """ - out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title) - return out def doi_link(self): """The link of the DOI, which redirects to the journal URL.""" diff --git a/allofplos/corpus/plos_corpus.py b/allofplos/corpus/plos_corpus.py index f2c45231..648922e8 100644 --- a/allofplos/corpus/plos_corpus.py +++ b/allofplos/corpus/plos_corpus.py @@ -37,7 +37,7 @@ from ..plos_regex import validate_doi from ..transformations import (BASE_URL_API, filename_to_doi, doi_to_path, doi_to_url) -from ..article_class import Article +from ..article import Article from .gdrive import (download_file_from_google_drive, get_zip_metadata, unzip_articles, ZIP_ID, LOCAL_ZIP, LOCAL_TEST_ZIP, TEST_ZIP_ID, min_files_for_valid_corpus) diff --git a/allofplos/makedb.py b/allofplos/makedb.py index a57bb471..fc412d3a 100644 --- a/allofplos/makedb.py +++ b/allofplos/makedb.py @@ -20,7 +20,7 @@ from .corpus import Corpus from .transformations import filename_to_doi, convert_country from . import starterdir -from .article_class import Article +from .article import Article journal_title_dict = { 'PLOS ONE': 'PLOS ONE', diff --git a/allofplos/samples/corpus_analysis.py b/allofplos/samples/corpus_analysis.py index 36eb0863..0b80144c 100644 --- a/allofplos/samples/corpus_analysis.py +++ b/allofplos/samples/corpus_analysis.py @@ -22,7 +22,7 @@ from ..plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list, download_updated_xml, get_all_solr_dois, download_check_and_move) -from ..article_class import Article +from ..article import Article counter = collections.Counter pmcdir = "pmc_articles" diff --git a/allofplos/tests/test_corpus.py b/allofplos/tests/test_corpus.py index 8bd2aedd..976931de 100644 --- a/allofplos/tests/test_corpus.py +++ b/allofplos/tests/test_corpus.py @@ -1,6 +1,6 @@ from . import TESTDATADIR from .. 
import Corpus, starterdir -from ..article_class import Article +from ..article import Article from ..corpus import listdir_nohidden import random diff --git a/allofplos/utils.py b/allofplos/utils.py new file mode 100644 index 00000000..02d1cbb8 --- /dev/null +++ b/allofplos/utils.py @@ -0,0 +1,27 @@ +import textwrap + +def dedent(text): + """Equivalent of textwrap.dedent that ignores unindented first line. + This means it will still dedent strings like: + '''foo + is a bar + ''' + For use in wrap_paragraphs. + + Taken from https://github.com/ipython/ipython_genutils/text.py + """ + + if text.startswith('\n'): + # text starts with blank line, don't ignore the first line + return textwrap.dedent(text) + + # split first line + splits = text.split('\n',1) + if len(splits) == 1: + # only one line + return textwrap.dedent(text) + + first, rest = splits + # dedent everything but the first line + rest = textwrap.dedent(rest) + return '\n'.join([first, rest])