Skip to content

Commit 2b7dd00

Browse files
authored
Merge pull request #43 from scrapinghub/browser-html
BrowserHtml
2 parents 4f03f0b + df68a24 commit 2b7dd00

File tree

6 files changed

+71
-34
lines changed

6 files changed

+71
-34
lines changed

tests/test_page_inputs.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
import pytest
55
import requests
66

7+
import parsel
78
from web_poet.page_inputs import (
89
HttpRequest,
910
HttpResponse,
1011
HttpRequestBody,
1112
HttpResponseBody,
1213
HttpRequestHeaders,
1314
HttpResponseHeaders,
15+
BrowserHtml,
1416
)
1517

1618

@@ -421,3 +423,14 @@ def test_html5_meta_charset():
421423
response = HttpResponse("http://www.example.com", body=body)
422424
assert response.encoding == 'gb18030'
423425
assert response.text == body.decode('gb18030')
426+
427+
428+
def test_browser_html():
    """BrowserHtml compares like a plain string and offers selector shortcuts."""
    markup = "<html><body><p>Hello, </p><p>world!</p></body></html>"
    page = BrowserHtml(markup)

    # Comparison against plain strings.
    assert page == markup
    assert page != "foo"

    # Both query shortcuts must yield the same paragraph texts.
    expected_texts = ["Hello, ", "world!"]
    assert page.xpath("//p/text()").getall() == expected_texts
    assert page.css("p::text").getall() == expected_texts

    # The underlying selector object is exposed as well.
    assert isinstance(page.selector, parsel.Selector)

web_poet/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
HttpResponseHeaders,
1010
HttpRequestBody,
1111
HttpResponseBody,
12+
BrowserHtml,
1213
)
1314
from .overrides import PageObjectRegistry, consume_modules, OverrideRule
1415

web_poet/mixins.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,48 @@
1+
import abc
12
from urllib.parse import urljoin
23

34
import parsel
45
from w3lib.html import get_base_url
56

67

7-
class ResponseShortcutsMixin:
8+
class SelectableMixin(abc.ABC):
    """
    Mixin adding parsel-based querying to a class that can supply its
    markup as text.

    Implement the ``._selector_input`` method in the subclass; in return
    you get the ``.selector`` property and the ``.xpath`` / ``.css``
    shortcut methods.
    """
    # Class-level default; the first ``.selector`` access stores the built
    # selector on the instance under the same (name-mangled) attribute.
    __cached_selector = None

    @abc.abstractmethod
    def _selector_input(self) -> str:
        """Return the text that the selector is built from."""
        raise NotImplementedError()  # pragma: nocover

    @property
    def selector(self) -> parsel.Selector:
        """Cached instance of :external:class:`parsel.selector.Selector`."""
        # XXX: the cache is hand-rolled on purpose: memoizemethod_noargs
        # doesn't work with non-hashable classes.
        cached = self.__cached_selector
        if cached is None:
            # XXX: should we pass base_url=self.url, as Scrapy does?
            cached = parsel.Selector(text=self._selector_input())
            self.__cached_selector = cached
        return cached

    def xpath(self, query, **kwargs):
        """A shortcut to ``.selector.xpath()``."""
        sel = self.selector
        return sel.xpath(query, **kwargs)

    def css(self, query):
        """A shortcut to ``.selector.css()``."""
        sel = self.selector
        return sel.css(query)
38+
39+
40+
# TODO: when dropping Python 3.7 support,
41+
# fix untyped ResponseShortcutsMixin.response using typing.Protocol
42+
43+
class ResponseShortcutsMixin(SelectableMixin):
844
"""Common shortcut methods for working with HTML responses.
45+
This mixin could be used with Page Object base classes.
946
1047
It requires "response" attribute to be present.
1148
"""
@@ -21,20 +58,8 @@ def html(self):
2158
"""Shortcut to HTML Response's content."""
2259
return self.response.text
2360

24-
@property
25-
def selector(self) -> parsel.Selector:
26-
"""``parsel.Selector`` instance for the HTML Response."""
27-
# TODO: when dropping Python 3.7 support,
28-
# implement it using typing.Protocol
29-
return self.response.selector # type: ignore
30-
31-
def xpath(self, query, **kwargs):
32-
"""Run an XPath query on a response, using :class:`parsel.Selector`."""
33-
return self.selector.xpath(query, **kwargs)
34-
35-
def css(self, query):
36-
"""Run a CSS query on a response, using :class:`parsel.Selector`."""
37-
return self.selector.css(query)
61+
def _selector_input(self) -> str:
62+
return self.html
3863

3964
@property
4065
def base_url(self) -> str:

web_poet/page_inputs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88
HttpRequestBody,
99
HttpResponseBody,
1010
)
11+
from .browser import BrowserHtml

web_poet/page_inputs/browser.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from web_poet.mixins import SelectableMixin
2+
3+
4+
class BrowserHtml(SelectableMixin, str):
    """HTML returned by a web browser, i.e. a snapshot of the DOM tree
    in HTML format.

    Being a ``str`` subclass, the instance is itself the markup that the
    selector is built from.
    """

    def _selector_input(self) -> str:
        # The instance is the markup string, so it is handed over as-is.
        return self
10+

web_poet/page_inputs/http.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
import attrs
21
import json
3-
import parsel
42
from typing import Optional, Dict, List, Type, TypeVar, Union, Tuple, AnyStr
53

4+
import attrs
65
from w3lib.encoding import (
76
html_to_unicode,
87
html_body_declared_encoding,
@@ -12,6 +11,7 @@
1211

1312
from web_poet._base import _HttpHeaders
1413
from web_poet.utils import memoizemethod_noargs
14+
from web_poet.mixins import SelectableMixin
1515

1616
T_headers = TypeVar("T_headers", bound="HttpResponseHeaders")
1717

@@ -163,7 +163,7 @@ class HttpRequest:
163163

164164

165165
@attrs.define(auto_attribs=False, slots=False, eq=False)
166-
class HttpResponse:
166+
class HttpResponse(SelectableMixin):
167167
"""A container for the contents of a response, downloaded directly using an
168168
HTTP client.
169169
@@ -213,6 +213,9 @@ def text(self) -> str:
213213
self._cached_text = text
214214
return self._cached_text
215215

216+
def _selector_input(self) -> str:
217+
return self.text
218+
216219
@property
217220
def encoding(self):
218221
""" Encoding of the response """
@@ -223,22 +226,6 @@ def encoding(self):
223226
or self._body_inferred_encoding()
224227
)
225228

226-
# XXX: see https://github.com/python/mypy/issues/1362
227-
@property # type: ignore
228-
@memoizemethod_noargs
229-
def selector(self) -> parsel.Selector:
230-
"""Cached instance of :external:class:`parsel.selector.Selector`."""
231-
# XXX: should we pass base_url=self.url, as Scrapy does?
232-
return parsel.Selector(text=self.text)
233-
234-
def xpath(self, query, **kwargs):
235-
"""A shortcut to ``HttpResponse.selector.xpath()``."""
236-
return self.selector.xpath(query, **kwargs)
237-
238-
def css(self, query):
239-
"""A shortcut to ``HttpResponse.selector.css()``."""
240-
return self.selector.css(query)
241-
242229
@memoizemethod_noargs
243230
def json(self):
244231
""" Deserialize a JSON document to a Python object. """

0 commit comments

Comments
 (0)