Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion allofplos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ def get_corpus_dir():
# import after creating global variables that they may rely upon
# (e.g., corpusdir)

from .article_class import Article
from .article import Article
from .corpus import Corpus
89 changes: 59 additions & 30 deletions allofplos/article_class.py → allofplos/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@

from . import get_corpus_dir
from .transformations import (filename_to_doi, _get_base_page, LANDING_PAGE_SUFFIX,
URL_SUFFIX, plos_page_dict, doi_url)
URL_SUFFIX, plos_page_dict, doi_url, doi_to_url, doi_to_path)
from .plos_regex import validate_doi
from .elements import (parse_article_date, get_contrib_info,
Journal, License, match_contribs_to_dicts)
from .utils import dedent


class Article():
class Article:
"""The primary object of a PLOS article, initialized by a valid PLOS DOI.

"""
Expand Down Expand Up @@ -46,6 +47,62 @@ def __eq__(self, other):
dir_eq = self.directory == other.directory
return doi_eq and dir_eq

def __str__(self, exclude_refs=True):
    """Return the article's local XML as a pretty-printed string.

    Meant for reading the XML of a local article when printing it; the
    output should not be used for hashing. By default the <back> element
    (which includes the references list) is dropped so the printout is
    shorter.

    :param exclude_refs: remove the <back> element from the tree before serializing
    :returns: pretty-printed XML of the article
    :rtype: {str}
    """
    xml_parser = et.XMLParser(remove_blank_text=True)
    document = et.parse(self.filename, xml_parser)
    if exclude_refs:
        article_root = document.getroot()
        back_matches = document.xpath('./back')
        # Only strip <back> when the article actually has one.
        if back_matches:
            article_root.remove(back_matches[0])
    return et.tostring(document,
                       method='xml',
                       encoding='unicode',
                       pretty_print=True)

def __repr__(self):
"""Value of an article object when you call it directly on the command line.

Shows the DOI and title of the article
:returns: DOI and title
:rtype: {str}
"""
out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title)
return out


def _repr_html_(self):
    """Return an HTML snippet for displaying the article in a Jupyter notebook.

    The snippet is a <div> with two rows: the title, linked to
    ``self.page``, and the DOI, linked to ``self.doi_link()``.

    :returns: HTML markup with the linked title and DOI
    :rtype: {str}
    """
    titlestyle = 'display:inline-flex;'
    titletextstyle = 'margin-left:.5em;'
    titlelink = ('<span style="{titlestyle}"><a href="{url}">'
                 '<em>{title}</em></a></span>').format(
                     url=self.page,
                     title=self.title,
                     titlestyle=titlestyle + titletextstyle,
                 )

    doilink = '<span><a href="{url}"><code>{doi}</code></a></span>'.format(
        url=self.doi_link(),
        doi=self.doi,
    )
    # The markup previously used "</br>" (not a valid tag) and opened two
    # <span> elements on the DOI row without closing either; emit a proper
    # line break and a balanced <span> instead.
    # NOTE(review): title and DOI are interpolated without HTML escaping --
    # confirm whether self.title can contain markup before adding escaping.
    out = dedent("""<div>
        <span style="{titlestyle}">Title: {titlelink}</span><br>
        <span>DOI: {doilink}</span>
        </div>
        """).format(doilink=doilink, titlelink=titlelink, titlestyle=titlestyle)

    return out


def reset_memoized_attrs(self):
"""Reset attributes to None when instantiating a new article object.

Expand Down Expand Up @@ -111,34 +168,6 @@ def doi(self, d):
self.reset_memoized_attrs()
self._doi = d

def __str__(self, exclude_refs=True):
    """Output when you print an article object on the command line.

    For parsing and viewing the XML of a local article. Should not be used for hashing
    Excludes <back> element (including references list) for easier viewing

    :param exclude_refs: remove references from the article tree (eases print viewing)
    :returns: pretty-printed XML of the article
    :rtype: {str}
    """
    parser = et.XMLParser(remove_blank_text=True)
    tree = et.parse(self.filename, parser)
    if exclude_refs:
        root = tree.getroot()
        back = tree.xpath('./back')
        # xpath() yields an empty list when there is no <back> element;
        # indexing it unconditionally would raise IndexError.
        if back:
            root.remove(back[0])
    local_xml = et.tostring(tree,
                            method='xml',
                            encoding='unicode',
                            pretty_print=True)
    return local_xml

def __repr__(self):
"""Value of an article object when you call it directly on the command line.

Shows the DOI and title of the article
:returns: DOI and title
:rtype: {str}
"""
out = "DOI: {0}\nTitle: {1}".format(self.doi, self.title)
return out

def doi_link(self):
"""The link of the DOI, which redirects to the journal URL."""
Expand Down
2 changes: 1 addition & 1 deletion allofplos/corpus/plos_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

from ..plos_regex import validate_doi
from ..transformations import (BASE_URL_API, filename_to_doi, doi_to_path, doi_to_url)
from ..article_class import Article
from ..article import Article
from .gdrive import (download_file_from_google_drive, get_zip_metadata, unzip_articles,
ZIP_ID, LOCAL_ZIP, LOCAL_TEST_ZIP, TEST_ZIP_ID, min_files_for_valid_corpus)

Expand Down
2 changes: 1 addition & 1 deletion allofplos/makedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from .corpus import Corpus
from .transformations import filename_to_doi, convert_country
from . import starterdir
from .article_class import Article
from .article import Article

journal_title_dict = {
'PLOS ONE': 'PLOS ONE',
Expand Down
2 changes: 1 addition & 1 deletion allofplos/samples/corpus_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ..plos_corpus import (listdir_nohidden, uncorrected_proofs_text_list,
download_updated_xml, get_all_solr_dois,
download_check_and_move)
from ..article_class import Article
from ..article import Article

counter = collections.Counter
pmcdir = "pmc_articles"
Expand Down
2 changes: 1 addition & 1 deletion allofplos/tests/test_corpus.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from . import TESTDATADIR
from .. import Corpus, starterdir
from ..article_class import Article
from ..article import Article
from ..corpus import listdir_nohidden

import random
Expand Down
27 changes: 27 additions & 0 deletions allofplos/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import textwrap

def dedent(text):
    """Dedent ``text`` like ``textwrap.dedent``, tolerating an unindented first line.

    A triple-quoted literal often starts its content right after the
    opening quotes, so only the following lines carry indentation:
    '''foo
        is a bar
        '''
    In that case the first line is kept verbatim and the remaining lines
    are dedented on their own. Text that begins with a newline, or that
    has no newline at all, is passed to ``textwrap.dedent`` unchanged.

    Taken from https://github.com/ipython/ipython_genutils/text.py
    """
    head, newline, tail = text.partition('\n')

    # No line break (single line or empty text), or a leading blank line:
    # the first line needs no special treatment.
    if not newline or not head:
        return textwrap.dedent(text)

    # Keep the first line as written; dedent everything after it.
    return head + '\n' + textwrap.dedent(tail)