Skip to content

Commit 8bbee78

Browse files
committed
Instantiate parsers only once
1 parent 0691799 commit 8bbee78

File tree

6 files changed

+119
-5
lines changed

6 files changed

+119
-5
lines changed

parsel/parser/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,36 @@
1+
from lxml import etree
2+
from lxml.etree import XMLParser as _UnsafeXMLParser
3+
from lxml.html import HTMLParser as _HTMLParser
4+
5+
6+
class _LXMLBaseParser(object):
    """Wrap one lxml parser instance and reuse it for every ``parse`` call.

    The underlying parser is built a single time, in ``__init__``.
    """

    def __init__(self, parser_cls):
        """Instantiate ``parser_cls`` with ``recover=True`` and UTF-8 encoding."""
        self._parser = parser_cls(recover=True, encoding='utf8')

    def parse(self, text, base_url):
        """Parse ``text`` and return the resulting root node.

        Surrounding whitespace and NUL characters are removed from ``text``
        before it is encoded as UTF-8.  When nothing is left after that, or
        when lxml produces no root for the input, an empty ``<html/>``
        document is parsed instead.
        """
        empty_document = b'<html/>'

        cleaned = text.strip().replace('\x00', '')
        payload = cleaned.encode('utf8')
        if not payload:
            payload = empty_document

        node = etree.fromstring(payload, parser=self._parser,
                                base_url=base_url)
        if node is not None:
            return node

        # The parser gave up on the input: fall back to an empty document.
        return etree.fromstring(empty_document, parser=self._parser,
                                base_url=base_url)
18+
19+
20+
class HTMLParser(_LXMLBaseParser):
    """Reusable parser built on top of ``lxml.html.HTMLParser``."""

    def __init__(self):
        base = super(HTMLParser, self)
        base.__init__(_HTMLParser)
24+
25+
26+
class _XMLParser(_UnsafeXMLParser):
    """``lxml.etree.XMLParser`` that leaves entities unresolved by default.

    Callers can still pass ``resolve_entities`` explicitly to override it.
    """

    def __init__(self, *args, **kwargs):
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(_XMLParser, self).__init__(*args, **kwargs)
31+
32+
33+
class XMLParser(_LXMLBaseParser):
    """Reusable parser built on top of ``_XMLParser``."""

    def __init__(self):
        base = super(XMLParser, self)
        base.__init__(_XMLParser)

parsel/parser/html.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
from parsel.parser import HTMLParser
2+
3+
# Module-level HTMLParser instance, created once when this module is imported.
HTML_PARSER = HTMLParser()

parsel/parser/xml.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
from parsel.parser import XMLParser
2+
3+
# Module-level XMLParser instance, created once when this module is imported.
XML_PARSER = XMLParser()

parsel/selector.py

Lines changed: 34 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,14 +3,39 @@
33
"""
44

55
import sys
6+
from importlib import import_module
7+
from warnings import warn
68

79
import six
8-
from lxml import etree, html
10+
from lxml import etree
911

1012
from .utils import flatten, iflatten, extract_regex, shorten
1113
from .csstranslator import HTMLTranslator, GenericTranslator
1214

1315

16+
def _load_object(path):
    """Import and return the object that the dotted ``path`` points to.

    Everything before the last dot is imported as a module and the part
    after it is looked up as an attribute of that module, so ``path`` may
    name a class, function, variable or class instance, e.g.
    ``'parsel.parser.html.HTML_PARSER'``.

    Raises ``ValueError`` if ``path`` contains no dot, and ``NameError`` if
    the module has no attribute with the requested name.  Errors raised
    while importing the module propagate unchanged.
    """
    try:
        # Unpacking fails with ValueError when there is no dot to split on.
        module_path, attribute = path.rsplit('.', 1)
    except ValueError:
        raise ValueError("Error loading object '%s': not a full path" % path)

    module_obj = import_module(module_path)

    try:
        return getattr(module_obj, attribute)
    except AttributeError:
        raise NameError(
            "Module '%s' doesn't define any object named '%s'"
            % (module_path, attribute))
37+
38+
1439
class CannotRemoveElementWithoutRoot(Exception):
1540
pass
1641

@@ -21,14 +46,16 @@ class CannotRemoveElementWithoutParent(Exception):
2146

2247
class SafeXMLParser(etree.XMLParser):
    """Deprecated ``etree.XMLParser`` with entity resolution off by default.

    Every instantiation emits a ``DeprecationWarning`` attributed to the
    caller (``stacklevel=2``).
    """

    def __init__(self, *args, **kwargs):
        warn('parsel.selector.SafeXMLParser is deprecated',
             category=DeprecationWarning, stacklevel=2)
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)
2653

2754
_ctgroup = {
28-
'html': {'_parser': html.HTMLParser,
55+
'html': {'_parser': 'parsel.parser.html.HTML_PARSER',
2956
'_csstranslator': HTMLTranslator(),
3057
'_tostring_method': 'html'},
31-
'xml': {'_parser': SafeXMLParser,
58+
'xml': {'_parser': 'parsel.parser.xml.XML_PARSER',
3259
'_csstranslator': GenericTranslator(),
3360
'_tostring_method': 'xml'},
3461
}
@@ -46,6 +73,8 @@ def _st(st):
4673
def create_root_node(text, parser_cls, base_url=None):
4774
"""Create root node for text using given parser class.
4875
"""
76+
warn('parsel.selector.create_root_node is deprecated',
77+
DeprecationWarning, stacklevel=2)
4978
body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
5079
parser = parser_cls(recover=True, encoding='utf8')
5180
root = etree.fromstring(body, parser=parser, base_url=base_url)
@@ -198,7 +227,7 @@ class Selector(object):
198227
def __init__(self, text=None, type=None, namespaces=None, root=None,
199228
base_url=None, _expr=None):
200229
self.type = st = _st(type or self._default_type)
201-
self._parser = _ctgroup[st]['_parser']
230+
self._parser = _load_object(_ctgroup[st]['_parser'])
202231
self._csstranslator = _ctgroup[st]['_csstranslator']
203232
self._tostring_method = _ctgroup[st]['_tostring_method']
204233

@@ -219,7 +248,7 @@ def __getstate__(self):
219248
raise TypeError("can't pickle Selector objects")
220249

221250
def _get_root(self, text, base_url=None):
222-
return create_root_node(text, self._parser, base_url=base_url)
251+
return self._parser.parse(text=text, base_url=base_url)
223252

224253
def xpath(self, query, namespaces=None, **kwargs):
225254
"""

tests/test_deprecations.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
# -*- coding:utf-8 -*-
2+
3+
4+
from unittest import TestCase
5+
from warnings import catch_warnings
6+
7+
from parsel.selector import create_root_node, SafeXMLParser
8+
from lxml.html import HTMLParser
9+
10+
11+
class TestDeprecations(TestCase):
    """Check that the deprecated ``parsel.selector`` APIs emit warnings."""

    def _deprecations_from(self, name, func, *args):
        """Call ``func(*args)`` and return the deprecation warnings for ``name``.

        Only ``DeprecationWarning`` entries whose message mentions ``name``
        are returned, so unrelated warnings raised during the call cannot
        change the count the tests assert on.
        """
        # Imported locally because the module only imports ``catch_warnings``.
        from warnings import simplefilter

        with catch_warnings(record=True) as recorded:
            # Without an explicit filter the outcome depends on the ambient
            # warning filters and on the once-per-location registry.
            simplefilter('always')
            func(*args)

        return [
            entry for entry in recorded
            if issubclass(entry.category, DeprecationWarning)
            and name in str(entry.message)
        ]

    def test_create_root_node(self):
        matching = self._deprecations_from(
            'create_root_node', create_root_node, u'…', HTMLParser)
        self.assertEqual(len(matching), 1)

    def test_SafeXMLParser(self):
        matching = self._deprecations_from('SafeXMLParser', SafeXMLParser)
        self.assertEqual(len(matching), 1)

tests/test_selector.py

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
from parsel import Selector
99
from parsel.selector import (
10+
_load_object,
1011
CannotRemoveElementWithoutRoot,
1112
CannotRemoveElementWithoutParent,
1213
)
@@ -913,3 +914,24 @@ def test_set(self):
913914
//div[@itemtype="http://schema.org/Event"]
914915
//*[@itemscope]/*/@itemprop)''').extract(),
915916
[u'url', u'name', u'startDate', u'location', u'offers'])
917+
918+
919+
# Compatibility shim: when the running interpreter has no builtin
# ``ModuleNotFoundError``, use ``ImportError`` in its place.
try:
    ModuleNotFoundError
except NameError:
    ModuleNotFoundError = ImportError


class LoadObjectTestCase(unittest.TestCase):
    """Error handling of ``_load_object`` for invalid dotted paths."""

    def test_incomplete_path(self):
        # A path without any dot cannot be split into module and name.
        with self.assertRaises(ValueError):
            _load_object('parsel')

    def test_inexistent_module(self):
        # The module part of the path cannot be imported.
        with self.assertRaises(ModuleNotFoundError):
            _load_object('parsel.inexistent.inexistent')

    def test_inexistent_object(self):
        # The module exists but does not define the requested name.
        with self.assertRaises(NameError):
            _load_object('parsel.parser.inexistent')

0 commit comments

Comments
 (0)