From c2cb35a51cfb49c14ae7ce879a54a68e24ebdc83 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sun, 22 May 2022 22:53:43 +0500 Subject: [PATCH 1/7] SelectableMixin with .selector property and .css/.xpath methods --- web_poet/mixins.py | 58 +++++++++++++++++++++++++++--------- web_poet/page_inputs/http.py | 22 ++------------ 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/web_poet/mixins.py b/web_poet/mixins.py index 3280bede..ca03a719 100644 --- a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -1,11 +1,50 @@ +import abc from urllib.parse import urljoin import parsel from w3lib.html import get_base_url -class ResponseShortcutsMixin: +class SelectableMixin(abc.ABC): + """ + Inherit from this mixin, implement ``.text`` property, + get ``.selector`` property and ``.xpath`` / ``.css`` methods. + """ + __cached_selector = None + + @property + @abc.abstractmethod + def text(self) -> str: + raise NotImplementedError() + + # XXX: see https://github.com/python/mypy/issues/1362 + @property # type: ignore + def selector(self) -> parsel.Selector: + """Cached instance of :external:class:`parsel.selector.Selector`.""" + # XXX: caching is implemented in a manual way to avoid issues with + # non-hashable classes, where memoizemethod_noargs doesn't work + if self.__cached_selector is not None: + return self.__cached_selector + # XXX: should we pass base_url=self.url, as Scrapy does? + sel = parsel.Selector(text=self.text) + self.__cached_selector = sel + return sel + + def xpath(self, query, **kwargs): + """A shortcut to ``.selector.xpath()``.""" + return self.selector.xpath(query, **kwargs) + + def css(self, query): + """A shortcut to ``.selector.css()``.""" + return self.selector.css(query) + + +# TODO: when dropping Python 3.7 support, +# fix untyped ResponseShortcutsMixin.response using typing.Protocol + +class ResponseShortcutsMixin(SelectableMixin): """Common shortcut methods for working with HTML responses. 
+ This mixin could be used with Page Object base classes. It requires "response" attribute to be present. """ @@ -19,22 +58,13 @@ def url(self): @property def html(self): """Shortcut to HTML Response's content.""" + # required for backwards compatibility; todo: deprecate return self.response.text @property - def selector(self) -> parsel.Selector: - """``parsel.Selector`` instance for the HTML Response.""" - # TODO: when dropping Python 3.7 support, - # implement it using typing.Protocol - return self.response.selector # type: ignore - - def xpath(self, query, **kwargs): - """Run an XPath query on a response, using :class:`parsel.Selector`.""" - return self.selector.xpath(query, **kwargs) - - def css(self, query): - """Run a CSS query on a response, using :class:`parsel.Selector`.""" - return self.selector.css(query) + def text(self) -> str: + # required for SelectableMixin + return self.html @property def base_url(self) -> str: diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index e7ef2aca..94aeb6c4 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -1,8 +1,7 @@ -import attrs import json -import parsel from typing import Optional, Dict, List, Type, TypeVar, Union, Tuple, AnyStr +import attrs from w3lib.encoding import ( html_to_unicode, html_body_declared_encoding, @@ -12,6 +11,7 @@ from web_poet._base import _HttpHeaders from web_poet.utils import memoizemethod_noargs +from web_poet.mixins import SelectableMixin T_headers = TypeVar("T_headers", bound="HttpResponseHeaders") @@ -163,7 +163,7 @@ class HttpRequest: @attrs.define(auto_attribs=False, slots=False, eq=False) -class HttpResponse: +class HttpResponse(SelectableMixin): """A container for the contents of a response, downloaded directly using an HTTP client. 
@@ -223,22 +223,6 @@ def encoding(self): or self._body_inferred_encoding() ) - # XXX: see https://github.com/python/mypy/issues/1362 - @property # type: ignore - @memoizemethod_noargs - def selector(self) -> parsel.Selector: - """Cached instance of :external:class:`parsel.selector.Selector`.""" - # XXX: should we pass base_url=self.url, as Scrapy does? - return parsel.Selector(text=self.text) - - def xpath(self, query, **kwargs): - """A shortcut to ``HttpResponse.selector.xpath()``.""" - return self.selector.xpath(query, **kwargs) - - def css(self, query): - """A shortcut to ``HttpResponse.selector.css()``.""" - return self.selector.css(query) - @memoizemethod_noargs def json(self): """ Deserialize a JSON document to a Python object. """ From 10b1570f2c7140e5db9c2c6bb59176e71d37e728 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sun, 22 May 2022 23:28:50 +0500 Subject: [PATCH 2/7] SelectorMixin: require explicit _selector_input function to be implemented This allows to pick a public-facing name which fits better in different cases (e.g. .html or .text) --- web_poet/mixins.py | 14 +++++--------- web_poet/page_inputs/http.py | 3 +++ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/web_poet/mixins.py b/web_poet/mixins.py index ca03a719..c0573820 100644 --- a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -7,15 +7,14 @@ class SelectableMixin(abc.ABC): """ - Inherit from this mixin, implement ``.text`` property, + Inherit from this mixin, implement ``._selector_input`` method, get ``.selector`` property and ``.xpath`` / ``.css`` methods. 
""" __cached_selector = None - @property @abc.abstractmethod - def text(self) -> str: - raise NotImplementedError() + def _selector_input(self) -> str: + raise NotImplementedError() # pragma: nocover # XXX: see https://github.com/python/mypy/issues/1362 @property # type: ignore @@ -26,7 +25,7 @@ def selector(self) -> parsel.Selector: if self.__cached_selector is not None: return self.__cached_selector # XXX: should we pass base_url=self.url, as Scrapy does? - sel = parsel.Selector(text=self.text) + sel = parsel.Selector(text=self._selector_input()) self.__cached_selector = sel return sel @@ -58,12 +57,9 @@ def url(self): @property def html(self): """Shortcut to HTML Response's content.""" - # required for backwards compatibility; todo: deprecate return self.response.text - @property - def text(self) -> str: - # required for SelectableMixin + def _selector_input(self) -> str: return self.html @property diff --git a/web_poet/page_inputs/http.py b/web_poet/page_inputs/http.py index 94aeb6c4..68c3071b 100644 --- a/web_poet/page_inputs/http.py +++ b/web_poet/page_inputs/http.py @@ -213,6 +213,9 @@ def text(self) -> str: self._cached_text = text return self._cached_text + def _selector_input(self) -> str: + return self.text + @property def encoding(self): """ Encoding of the response """ From d6c323120abd2b1be5426702829eff5da74a7f38 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sun, 22 May 2022 23:29:42 +0500 Subject: [PATCH 3/7] BrowserHtml --- tests/test_page_inputs.py | 13 +++++++++++++ web_poet/page_inputs/__init__.py | 1 + web_poet/page_inputs/browser.py | 10 ++++++++++ 3 files changed, 24 insertions(+) create mode 100644 web_poet/page_inputs/browser.py diff --git a/tests/test_page_inputs.py b/tests/test_page_inputs.py index bc72e5b2..65934a10 100644 --- a/tests/test_page_inputs.py +++ b/tests/test_page_inputs.py @@ -4,6 +4,7 @@ import pytest import requests +import parsel from web_poet.page_inputs import ( HttpRequest, HttpResponse, @@ -11,6 +12,7 @@ 
HttpResponseBody, HttpRequestHeaders, HttpResponseHeaders, + BrowserHtml, ) @@ -421,3 +423,14 @@ def test_html5_meta_charset(): response = HttpResponse("http://www.example.com", body=body) assert response.encoding == 'gb18030' assert response.text == body.decode('gb18030') + + +def test_browser_html(): + src = "

<html><body><p>Hello, </p><p>world!</p></body></html>

" + html = BrowserHtml(src) + assert html == src + assert html != "foo" + + assert html.xpath("//p/text()").getall() == ["Hello, ", "world!"] + assert html.css("p::text").getall() == ["Hello, ", "world!"] + assert isinstance(html.selector, parsel.Selector) diff --git a/web_poet/page_inputs/__init__.py b/web_poet/page_inputs/__init__.py index 9491a5c0..129d04a6 100644 --- a/web_poet/page_inputs/__init__.py +++ b/web_poet/page_inputs/__init__.py @@ -8,3 +8,4 @@ HttpRequestBody, HttpResponseBody, ) +from .browser import BrowserHtml diff --git a/web_poet/page_inputs/browser.py b/web_poet/page_inputs/browser.py new file mode 100644 index 00000000..f12fca2d --- /dev/null +++ b/web_poet/page_inputs/browser.py @@ -0,0 +1,10 @@ +from web_poet.mixins import SelectableMixin + + +class BrowserHtml(str, SelectableMixin): + """ HTML returned by a web browser, + i.e. snapshot of the DOM tree in an HTML format. + """ + def _selector_input(self) -> str: + return self + From e4ace93761d67f9ff134e07be4cd2e5cf0dccf42 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Tue, 24 May 2022 13:49:17 +0500 Subject: [PATCH 4/7] skipping type check is no longer needed for .selector attribute It was required before because property was applied to a decorated method. 
--- web_poet/mixins.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web_poet/mixins.py b/web_poet/mixins.py index c0573820..faf6c0f6 100644 --- a/web_poet/mixins.py +++ b/web_poet/mixins.py @@ -16,8 +16,7 @@ class SelectableMixin(abc.ABC): def _selector_input(self) -> str: raise NotImplementedError() # pragma: nocover - # XXX: see https://github.com/python/mypy/issues/1362 - @property # type: ignore + @property def selector(self) -> parsel.Selector: """Cached instance of :external:class:`parsel.selector.Selector`.""" # XXX: caching is implemented in a manual way to avoid issues with From ed3af666028eaf9945f18e41d9524b2f5e158814 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 26 May 2022 17:38:59 +0500 Subject: [PATCH 5/7] expose web_poet.BrowserHtml --- web_poet/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 03943893..d5699a97 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -9,6 +9,7 @@ HttpResponseHeaders, HttpRequestBody, HttpResponseBody, + BrowserHtml, ) from .overrides import PageObjectRegistry, consume_modules, OverrideRule From 0010b351a019c72600381b51139abd5de6e18728 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 26 May 2022 23:53:31 +0500 Subject: [PATCH 6/7] switch mixin position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- web_poet/page_inputs/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/page_inputs/browser.py b/web_poet/page_inputs/browser.py index f12fca2d..2be2a9e4 100644 --- a/web_poet/page_inputs/browser.py +++ b/web_poet/page_inputs/browser.py @@ -1,7 +1,7 @@ from web_poet.mixins import SelectableMixin -class BrowserHtml(str, SelectableMixin): +class BrowserHtml(SelectableMixin, str): """ HTML returned by a web browser, i.e. snapshot of the DOM tree in an HTML format. 
""" From df68a248d827434e89ebed29ce1cd42b91ee427f Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 26 May 2022 23:53:42 +0500 Subject: [PATCH 7/7] Update web_poet/page_inputs/browser.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves --- web_poet/page_inputs/browser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/page_inputs/browser.py b/web_poet/page_inputs/browser.py index 2be2a9e4..a4096298 100644 --- a/web_poet/page_inputs/browser.py +++ b/web_poet/page_inputs/browser.py @@ -3,7 +3,7 @@ class BrowserHtml(SelectableMixin, str): """ HTML returned by a web browser, - i.e. snapshot of the DOM tree in an HTML format. + i.e. snapshot of the DOM tree in HTML format. """ def _selector_input(self) -> str: return self